# Importing libraries 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('review.csv',usecols=['reviewText','overall'])

In [3]:
df.head()

Unnamed: 0,reviewText,overall
0,"Not much to write about here, but it does exac...",5.0
1,The product does exactly as it should and is q...,5.0
2,The primary job of this device is to block the...,5.0
3,Nice windscreen protects my MXL mic and preven...,5.0
4,This pop filter is great. It looks and perform...,5.0


In [4]:
df['overall'].value_counts()

5.0    6938
4.0    2084
3.0     772
2.0     250
1.0     217
Name: overall, dtype: int64

# Cleaning dataset 

In [5]:
# Replace email addresses with 'email*
df['reviewText'] = df['reviewText'].str.replace(r'^.+@[^\.].*\. [a-z]{2,}$',
                                  'emailaddress')
                                             
#Replace URLs with 'webaddress"
df['reviewText'] = df['reviewText'].str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

                                             
#Replace money symbols with 'moneysymb* (£ can by typed with ALT key + 156)
df['reviewText'] = df['reviewText'].str.replace(r'£|\$','dollers')
                                             
#RepLace 10 digit phone numbers (formats include paranthesis, spaces,no spaces, dashes) with 'phonenumber
df['reviewText'] = df['reviewText'].str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumber')
                                             
                                             
#Replace numbers with numbr
df['reviewText'] = df['reviewText'].str.replace(r'\d+(\.\d+)?','numbr')

In [6]:
# Remove punctuation (^ = this symbol is mean not equals to)
df['reviewText'] = df['reviewText'].str.replace(r'[^\w\d\s]',' ')
                                           
# Replace whitespace between terms with a single space
df['reviewText'] = df['reviewText'].str.replace(r'\s+',' ')
                                           
# Remove Leading and trailing whitespace
df['reviewText'] = df['reviewText'].str.replace(r'^\s+|\s+?$',' ')

In [7]:
df['reviewText'].replace(np.NaN,df['reviewText'].mean,inplace=True)

In [8]:
df.head()

Unnamed: 0,reviewText,overall
0,Not much to write about here but it does exact...,5.0
1,The product does exactly as it should and is q...,5.0
2,The primary job of this device is to block the...,5.0
3,Nice windscreen protects my MXL mic and preven...,5.0
4,This pop filter is great It looks and performs...,5.0


# Applying machine learning algorithms 

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [10]:
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,5),analyzer='char')

In [11]:
X = tfidf.fit_transform(df['reviewText'].values.astype("U"))
y = df['overall']

In [12]:
X.shape,y.shape

((10261, 20000), (10261,))

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 0)

In [14]:
X_train.shape

(8208, 20000)

In [15]:
clf=LinearSVC(C = 20,class_weight='balanced')
clf.fit(X_train,y_train)

In [16]:
y_pred = clf.predict(X_test)

In [17]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         1.0       0.36      0.21      0.26        39
         2.0       0.24      0.15      0.18        55
         3.0       0.21      0.25      0.23       134
         4.0       0.35      0.35      0.35       451
         5.0       0.77      0.78      0.78      1374

    accuracy                           0.62      2053
   macro avg       0.39      0.35      0.36      2053
weighted avg       0.62      0.62      0.62      2053



# Testing the application

In [18]:
x = 'this product is really bad. i do not like it'

vec = tfidf.transform([x])
clf.predict(vec)

array([1.])

In [19]:
x = 'this product is really good. i like it'

vec = tfidf.transform([x])
clf.predict(vec)

array([5.])

In [20]:
x = 'this product is good,but i dont want to buy it'

vec = tfidf.transform([x])
clf.predict(vec)

array([3.])