In [20]:
import pandas as pd

In [21]:
data = pd.read_csv('preprocessed_data.csv')

In [58]:
data.isna().sum()

reviews      23
sentiment     0
dtype: int64

In [59]:
data = data.dropna()

In [61]:
data.isna().sum()

reviews      0
sentiment    0
dtype: int64

In [62]:
data = data[["reviews","sentiment"]]

In [63]:
data.head()

Unnamed: 0,reviews,sentiment
0,A fantastic overview and immersion into the wo...,1
1,Good course for learn the principles of oop wi...,1
2,Really a great course! it highlighted the most...,1
3,The course is simple and excellent for new lea...,1
4,"I learned a lot from feminism terms, I have an...",1


In [64]:
data['sentiment'].unique()

array([ 1, -1,  0], dtype=int64)

In [65]:
from sklearn.utils import resample
#create two different dataframe of majority and minority class 
df_majority = data[(data['sentiment']==1)] 
df_minority = data[(data['sentiment']!=1)] 
df_majority_dwnsampled = resample(df_majority, 
                                 replace=True,    
                                 n_samples= 81845, 
                                 random_state=42)  
# Combine majority class with upsampled minority class
df_dwnsampled = pd.concat([df_majority_dwnsampled, df_minority])

In [66]:
print("Number of rows per star rating")
print(data['sentiment'].value_counts())

Number of rows per star rating
 1    81840
 0    48292
-1    33535
Name: sentiment, dtype: int64


In [67]:
# function to split the dataset (70-30)trained-test 
from sklearn.model_selection import train_test_split
df_dwnsampled = df_dwnsampled.dropna()

In [68]:
x_train, x_test, y_train, y_test = train_test_split(df_dwnsampled['reviews'], df_dwnsampled['sentiment'], test_size = 0.3)

In [69]:
x_train.head()

90535         Pretty average. Check out Angela Yu on Udemy.
39945     A very comprehensive and practical course on h...
160598    I expected much more in this course. In my poi...
107933    This course was average. The first 2 weeks wer...
133387    THIS course is using only its own packages and...
Name: reviews, dtype: object

In [70]:
x_test.info()

<class 'pandas.core.series.Series'>
Int64Index: 49102 entries, 67262 to 81004
Series name: reviews
Non-Null Count  Dtype 
--------------  ----- 
49102 non-null  object
dtypes: object(1)
memory usage: 767.2+ KB


In [71]:
y_train.head()

90535     0
39945     1
160598    0
107933   -1
133387    0
Name: sentiment, dtype: int64

In [72]:
y_test.head()

67262    1
33326    1
50007    1
80335    1
22599    1
Name: sentiment, dtype: int64

In [73]:
from sklearn.feature_extraction.text import CountVectorizer #to conver letter into numbers
vectorizer = CountVectorizer(token_pattern= r'\b\w+\b')
train_matrix = vectorizer.fit_transform(x_train)
test_matrix = vectorizer.transform(x_test)

In [74]:
train_matrix #sparse matrix

<114570x41356 sparse matrix of type '<class 'numpy.int64'>'
	with 2784218 stored elements in Compressed Sparse Row format>

In [75]:
vectorizer #function

CountVectorizer(token_pattern='\\b\\w+\\b')

In [76]:
train_matrix.shape

(114570, 41356)

In [77]:
x_train = train_matrix
x_test = test_matrix

In [78]:
from sklearn.ensemble import RandomForestClassifier

In [79]:
rf = RandomForestClassifier()   # fun to obj

In [80]:
rf.fit(x_train, y_train) #training model reviews to label

RandomForestClassifier()

In [81]:
rf.score(x_test, y_test)


0.9321616227444911

In [82]:
from sklearn.metrics import classification_report

In [83]:
preds = rf.predict(x_test)

In [84]:
print(classification_report(preds,y_test))

              precision    recall  f1-score   support

          -1       0.90      0.96      0.93      9454
           0       0.91      0.90      0.90     14632
           1       0.96      0.94      0.95     25016

    accuracy                           0.93     49102
   macro avg       0.92      0.93      0.93     49102
weighted avg       0.93      0.93      0.93     49102



In [85]:
import pickle
pickle.dump(rf, open('model.pkl', 'wb'))

In [87]:
pickled_model = pickle.load(open('model.pkl', 'rb'))
pickled_model.predict(x_train)

array([ 0,  1,  0, ...,  1, -1,  1], dtype=int64)

In [88]:
pickle.dump(vectorizer, open("vector.pkl","wb"))