<a href="https://colab.research.google.com/github/caprolaliac/MLPrac/blob/main/sentiment-analysis/SA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle
import re

import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.stem.porter import PorterStemmer
from scipy.sparse import csr_matrix
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from wordcloud import WordCloud
from xgboost import XGBClassifier
# Fetch the NLTK stopword corpus before it is read below; the download call
# reports "already up-to-date" when the package is present locally.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stopwords held as a set; the preprocessing cell tests each token
# for membership in it.
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Location of the Yelp reviews CSV. The path sits under /content/drive/MyDrive;
# NOTE(review): presumably Google Drive must already be mounted in Colab for
# this to resolve -- no mount call is visible in this notebook.
DATA_PATH = '/content/drive/MyDrive/proc/yelp.csv'

df = pd.read_csv(DATA_PATH)
# Preview the first ten rows to sanity-check the columns that were loaded.
df.head(10)

Unnamed: 0,class_index,review_text
0,1,I got 'new' tires from them and within two wee...
1,1,Don't waste your time. We had two different p...
2,1,All I can say is the worst! We were the only 2...
3,1,I have been to this restaurant twice and was d...
4,1,Food was NOT GOOD at all! My husband & I ate h...
5,3,This is a tiny Starbucks and it locations like...
6,2,Typical Starbucks coffee chain. 2 things I don...
7,4,So.Much.Fun! \n\nI WISH I could play a song at...
8,4,"My friend is a piano teacher, so I took it as ..."
9,3,Stopped by on a Mon evening after trying to di...


In [None]:
def categorize_rating(rating):
    """Map a numeric rating to a coarse sentiment label.

    Returns 'positive' for 4 or 5, 'neutral' for 3, and 'negative' for
    every other value.
    """
    if rating == 3:
        return 'neutral'
    return 'positive' if rating in (4, 5) else 'negative'

# Derive a sentiment label for every row from its class_index value.
df['category'] = df['class_index'].map(categorize_rating)

In [None]:
stemmer = PorterStemmer()
# Compiled once up front instead of being re-parsed for every review.
NON_LETTERS = re.compile('[^a-zA-Z]')


def preprocess_review(text):
    """Normalise one raw review into a space-joined string of stemmed tokens.

    Every character outside a-z/A-Z is replaced by a space, the text is
    lowercased and split on whitespace, tokens present in STOPWORDS (built in
    the import cell) are dropped, and the remaining tokens are Porter-stemmed.

    Args:
        text: the raw review string.

    Returns:
        The cleaned review as a single string of stems separated by spaces.
    """
    tokens = NON_LETTERS.sub(' ', text).lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in STOPWORDS)


# Iterate the column directly: df.iloc[i]['review_text'] materialises a whole
# row Series on each iteration, which is needlessly slow on a large frame.
corpus = [preprocess_review(text) for text in df['review_text']]

In [None]:
# Attach the cleaned text as a new column; `corpus` was built by iterating
# the rows of df in order, so it lines up with df positionally.
df['processed_review'] = corpus

In [None]:
# Spot-check the derived `category` and `processed_review` columns next to the raw text.
df.head()

Unnamed: 0,class_index,review_text,category,processed_review
0,1,I got 'new' tires from them and within two wee...,negative,got new tire within two week got flat took car...
1,1,Don't waste your time. We had two different p...,negative,wast time two differ peopl come hous give us e...
2,1,All I can say is the worst! We were the only 2...,negative,say worst peopl place lunch place freez load k...
3,1,I have been to this restaurant twice and was d...,negative,restaur twice disappoint time go back first ti...
4,1,Food was NOT GOOD at all! My husband & I ate h...,negative,food good husband ate coupl week ago first tim...


In [None]:
# Target: the class_index column (the derived `category` column is not used here).
y = df['class_index']

# Features: TF-IDF weights computed over the stemmed review text of every row in df.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['processed_review'])

In [None]:
# Split the *text* before vectorising so the TF-IDF vocabulary and IDF weights
# are learned from training reviews only. Fitting the vectorizer on the full
# corpus and splitting afterwards leaks test-set statistics into the features
# and inflates the reported test scores.
text_train, text_test, y_train, y_test = train_test_split(
    df['processed_review'], y, test_size=0.2, random_state=42
)

X_train = vectorizer.fit_transform(text_train)  # refit on the training rows only
X_test = vectorizer.transform(text_test)        # project onto the train vocabulary

# Keep X consistent with the refitted vectorizer so any later use of X has the
# same feature columns as X_train / X_test.
X = vectorizer.transform(df['processed_review'])

In [None]:
# Base estimator for the hyper-parameter search. The seed is fixed so repeated
# runs grow the same forests; the search's own random_state only controls
# which parameter candidates are sampled, not the estimator's randomness.
rf = RandomForestClassifier(random_state=42)
rf

In [None]:
# Search space sampled by the randomized search over the random forest.
# 'auto' is intentionally absent from max_features: it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it makes the fit fail. For
# classifiers it was an alias of 'sqrt', so no configuration is lost.
param_grid = {
    'n_estimators': [100, 200, 300, 500, 1000],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'class_weight': [None, 'balanced', 'balanced_subsample'],
    # NOTE(review): 0.1 and 0.2 look like very strong pruning for impurity-based
    # trees -- confirm these values are intended.
    'ccp_alpha': [0.0, 0.1, 0.2]
}

In [None]:
# RandomizedSearchCV is imported in the notebook's top import cell.
n_iter_search = 10  # number of parameter settings sampled from param_grid

# 3-fold cross-validated search scored by macro-averaged F1, using all
# available cores (n_jobs=-1) and a fixed seed for the candidate sampling.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=n_iter_search,
    cv=3,
    scoring='f1_macro',
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
# Report the winning configuration and its mean cross-validated score
# (the search was scored with f1_macro).
best_params = random_search.best_params_
best_score = random_search.best_score_
print("Best parameters found: ", best_params)
print("Best cross-validation score: {:.2f}".format(best_score))

In [None]:
# Keep a handle on the search's best estimator; later cells save it and use
# it for predictions.
best_rf = random_search.best_estimator_

In [None]:
# joblib is imported in the notebook's top import cell.
# Save the fitted TF-IDF vectorizer alongside the model: the forest only
# accepts features produced by this exact vectorizer, so the model file alone
# cannot be used to score new text.
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
joblib.dump(best_rf, 'best_rf.joblib')

In [None]:
# Predict on both splits so train and test performance can be compared.
y_train_pred, y_test_pred = (
    best_rf.predict(features) for features in (X_train, X_test)
)

In [None]:
# Accuracy of the best estimator's predictions on the train and test splits.
train_acc, test_acc = (
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
)

In [None]:
# Label each figure: both lines previously printed as "Accuracy", making the
# train and test numbers indistinguishable in the output.
print(f"Train accuracy: {train_acc:.4f}")
print(f"Test accuracy: {test_acc:.4f}")

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
print("\nDetailed Classification Report:")
report = classification_report(y_test, y_test_pred)
print(report)