In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides library deprecation / fit-failure warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

import numpy as np                                  
import pandas as pd                                 
import nltk    

In [None]:
# Load the raw review data. A raw string is used so the backslashes of the
# Windows path are taken literally instead of being parsed as escape sequences
# (`\D` and `\R` are invalid escapes and trigger a warning on recent Python versions).
# NOTE(review): this is an absolute local path — point it at your own copy of Reviews.csv.
df = pd.read_csv(r'D:\Data Science Projects\Reviews.csv')

# Quick sanity check: (rows, columns) plus the first few records.
print(df.shape)
df.head()

In [None]:
# List the column names available in the loaded frame.
df.columns

### Data Preparation

In [None]:
# Share of helpfulness votes that were positive; reviews without any vote
# (denominator not greater than 0) receive the sentinel value -1.
has_votes = df['HelpfulnessDenominator'] > 0
helpful_ratio = df['HelpfulnessNumerator'] / df['HelpfulnessDenominator']
df['Helpful %'] = np.where(has_votes, helpful_ratio, -1)

In [None]:
# Confirm the new 'Helpful %' column is present.
df.head()

### Assigning labels to Helpful % according to its value

In [None]:
# Distinct ratio values present (including the -1 "no votes" sentinel).
df['Helpful %'].unique()

In [None]:
# Preview the bucketing of 'Helpful %'.
# -1 is the "no votes" sentinel, so it gets its own 'Empty' bucket [-1, -0.5];
# real ratios start at 0 and fall into '0-20%' = (-0.5, 0.2], and so on.
# pd.cut intervals are right-closed by default, so without include_lowest=True the
# lowest edge (-1) would belong to no bin and every "no votes" row would become NaN.
pd.cut(
    df['Helpful %'],
    bins=[-1, -0.5, 0.2, 0.4, 0.6, 0.8, 1.0],
    labels=['Empty', '0-20%', '20-40%', '40-60%', '60-80%', '80-100%'],
    include_lowest=True,
)

In [None]:
# Bucket 'Helpful %' into labelled ranges and store the result as '%upvote'.
# -1 is the "no votes" sentinel, so it gets its own 'Empty' bucket [-1, -0.5];
# real ratios start at 0 and fall into '0-20%' = (-0.5, 0.2], and so on.
# pd.cut intervals are right-closed by default, so without include_lowest=True the
# lowest edge (-1) would belong to no bin and every "no votes" row would become NaN.
df['%upvote'] = pd.cut(
    df['Helpful %'],
    bins=[-1, -0.5, 0.2, 0.4, 0.6, 0.8, 1.0],
    labels=['Empty', '0-20%', '20-40%', '40-60%', '60-80%', '80-100%'],
    include_lowest=True,
)

In [None]:
# Confirm the new '%upvote' column is present.
df.head()

In [None]:
# Non-null count of every column for each (Score, %upvote) combination.
df.groupby(['Score', '%upvote']).count()

### Considering only the Id column, to count the reviews in each Score / %upvote category

In [None]:
# Same grouping, restricted to the count of the 'Id' column.
df.groupby(['Score', '%upvote'])[['Id']].count()

In [None]:
# Flatten the grouped 'Id' counts into a regular frame with
# 'Score', '%upvote' and 'Id' columns.
df_s = (
    df.groupby(['Score', '%upvote'])[['Id']]
    .count()
    .reset_index()
)
df_s

### Create a pivot table for easier comparison

In [None]:
# Reshape the counts: one row per '%upvote' bucket, one column per 'Score'.
df_s.pivot(index='%upvote',columns='Score')

### Create a heatmap for better visualisation

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Rows are '%upvote' buckets, columns are review scores; each cell holds the
# review count and is annotated with that value.
upvote_by_score = df_s.pivot(index='%upvote', columns='Score')
sns.heatmap(upvote_by_score, annot=True, cmap='YlGnBu')
plt.title('How helpful users find among user scores')

In [None]:
# (rows, columns) of the frame after adding the helper columns.
df.shape

In [None]:
# Re-inspect the frame before building the sentiment dataset.
df.head()

In [None]:
# Distinct values of the 'Score' column.
df['Score'].unique()

In [None]:
# Build the sentiment dataset: drop reviews scored 3, use the review text as
# features and collapse the remaining scores into a binary label
# (1 and 2 -> 0, 4 and 5 -> 1).
df2 = df.loc[df['Score'].ne(3)]
X = df2['Text']
y_dict = {1: 0, 2: 0, 4: 1, 5: 1}
y = df2['Score'].map(y_dict)

### Convert text into vectors using NLP

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectoriser that drops scikit-learn's built-in English stop words.
c = CountVectorizer(stop_words = 'english')

In [None]:
# Learn the vocabulary from the review text and build the document-term count matrix.
X_c = c.fit_transform(X)

In [None]:
# Number of columns in the document-term matrix, i.e. the vocabulary size.
n_features = X_c.shape[1]
print(f'features: {n_features}')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out a test set (scikit-learn's default split ratio). The seed is fixed so the
# split — and therefore the accuracy reported below — is reproducible across reruns.
X_train, X_test, y_train, y_test = train_test_split(X_c, y, random_state=42)
print(' train records: {}'.format(X_train.shape[0]))

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier with scikit-learn's default hyper-parameters.
log=LogisticRegression()

In [None]:
# Fit on the training split, then report mean accuracy on the held-out split.
ml = log.fit(X_train, y_train)
acc = ml.score(X_test, y_test)
print(f'Model Accuracy: {acc}')

### Fetch Top 20 Positive & Top 20 negative words

In [None]:
# Vocabulary words in column order of the document-term matrix.
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement when it
# exists and fall back to the legacy method on older releases. The result is
# converted to a list to keep the type the legacy method returned.
if hasattr(c, 'get_feature_names_out'):
    w = list(c.get_feature_names_out())
else:
    w = c.get_feature_names()
w

In [None]:
# First (and, for this binary target, only) row of the coefficient matrix as a plain list.
coef = ml.coef_[0].tolist()
coef

In [None]:
# Pair every vocabulary word with its fitted coefficient.
coeff_df = pd.DataFrame({'Word' : w, 'Coefficient' : coef})
coeff_df

In [None]:
# Largest coefficients first; ties are broken by word, also in descending order.
coeff_df = coeff_df.sort_values(by=['Coefficient', 'Word'], ascending=[False, False])
coeff_df

In [None]:
# The frame is sorted by descending coefficient, so the head holds the most
# positive words and the tail the most negative ones.
top_positive = coeff_df.head(20).to_string(index=False)
top_negative = coeff_df.tail(20).to_string(index=False)

print('-Top 20 positive-')
print(top_positive)
print('\n')
print('-Top 20 negative-')
print(top_negative)

### Function to apply multiple NLP techniques + multiple ML algorithms to achieve the best accuracy

In [None]:
def text_fit(X, y, nlp_model, ml_model, coef_show=1, random_state=None):
    """Vectorise text, fit a classifier on a train split and print its test accuracy.

    Parameters
    ----------
    X : documents handed to ``nlp_model.fit_transform``.
    y : labels aligned with ``X``.
    nlp_model : vectoriser exposing ``fit_transform`` and either
        ``get_feature_names_out`` or the legacy ``get_feature_names``.
    ml_model : estimator whose ``fit`` returns the fitted model; that model must
        provide ``score`` and, when ``coef_show == 1``, ``coef_``.
    coef_show : when equal to 1, also print the 20 highest- and 20
        lowest-weighted words.
    random_state : forwarded to ``train_test_split``; the default ``None`` keeps
        the original unseeded (non-reproducible) split.

    Returns
    -------
    None. All results are printed.
    """
    X_c = nlp_model.fit_transform(X)
    print('features: {}'.format(X_c.shape[1]))
    X_train, X_test, y_train, y_test = train_test_split(X_c, y, random_state=random_state)
    print(' train records: {}'.format(X_train.shape[0]))
    print(' test records: {}'.format(X_test.shape[0]))
    ml = ml_model.fit(X_train, y_train)
    acc = ml.score(X_test, y_test)
    print('Model Accuracy: {}'.format(acc))

    if coef_show == 1:
        # `get_feature_names` was removed in scikit-learn 1.2; prefer the
        # replacement and fall back to the legacy method on older releases.
        if hasattr(nlp_model, 'get_feature_names_out'):
            w = list(nlp_model.get_feature_names_out())
        else:
            w = nlp_model.get_feature_names()
        coef = ml.coef_.tolist()[0]
        coeff_df = pd.DataFrame({'Word': w, 'Coefficient': coef})
        # Descending by coefficient, ties broken alphabetically by word.
        coeff_df = coeff_df.sort_values(['Coefficient', 'Word'], ascending=[False, True])
        print('\n')
        print('-Top 20 positive-')
        print(coeff_df.head(20).to_string(index=False))
        print('\n')
        print('-Top 20 negative-')
        print(coeff_df.tail(20).to_string(index=False))
    
    


In [None]:
# Fresh stop-word-filtered CountVectorizer for the helper-based run
# (both imports repeat ones made in earlier cells).
from sklearn.feature_extraction.text import CountVectorizer
c = CountVectorizer(stop_words = 'english')
from sklearn.linear_model import LogisticRegression

In [None]:
# Count vectors + logistic regression; coef_show keeps its default of 1, so the top words are printed too.
text_fit(X, y, c, LogisticRegression())

### Predict function

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score
def predict(X, y, nlp_model, ml_model, random_state=None):
    """Vectorise text, fit a classifier and print its confusion matrix and accuracy.

    Parameters
    ----------
    X : documents handed to ``nlp_model.fit_transform``.
    y : labels aligned with ``X``.
    nlp_model : vectoriser exposing ``fit_transform``.
    ml_model : estimator whose ``fit`` returns a fitted model with ``predict``.
    random_state : forwarded to ``train_test_split``; the default ``None`` keeps
        the original unseeded (non-reproducible) split.

    Returns
    -------
    None. The confusion matrix and the accuracy are printed.
    """
    X_c = nlp_model.fit_transform(X)
    print('features: {}'.format(X_c.shape[1]))
    X_train, X_test, y_train, y_test = train_test_split(X_c, y, random_state=random_state)
    print(' train records: {}'.format(X_train.shape[0]))
    print(' test records: {}'.format(X_test.shape[0]))
    ml = ml_model.fit(X_train, y_train)
    predictions = ml.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred). With this order the matrix rows
    # are the true classes and the columns the predicted ones; the previous
    # swapped order printed the transposed matrix.
    cm = confusion_matrix(y_test, predictions)
    print(cm)
    acc = accuracy_score(y_test, predictions)
    print(acc)

In [None]:
# Fresh vectoriser and logistic-regression instances for the predict() run
# (both imports repeat ones made in earlier cells).
from sklearn.feature_extraction.text import CountVectorizer
c = CountVectorizer(stop_words = 'english')
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()

In [None]:
# Count vectors + logistic regression: prints the confusion matrix and the accuracy.
predict(X,y,c,lr)

### Accuracy is around 93.9%

In [None]:
from sklearn.dummy import DummyClassifier

In [None]:
# Baseline run with DummyClassifier; coef_show=0 skips the word-coefficient listing.
text_fit(X, y, c, DummyClassifier(),0)

### Logistic regression model on TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Same pipeline as before, but with TF-IDF weighting instead of raw counts.
tfidf = TfidfVectorizer(stop_words = 'english')
text_fit(X, y, tfidf, LogisticRegression())

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF + logistic regression again, this time printing the confusion matrix and accuracy.
tfidf = TfidfVectorizer(stop_words = 'english')
predict(X, y, tfidf, LogisticRegression())

### Upvote prediction

In [None]:
# Restrict the upvote analysis to reviews whose score equals 5.
data = df.loc[df['Score'].eq(5)]

In [None]:
# Columns available in the 5-star subset.
data.columns

In [None]:
# Keep only the clearly low and clearly high helpfulness buckets; rows labelled
# 'Empty' or '40-60%' are left out.
kept_buckets = ['0-20%', '20-40%', '60-80%', '80-100%']
data2 = data.loc[data['%upvote'].isin(kept_buckets)]
data2.shape

In [None]:
# Features for the upvote model: the review text of the selected 5-star reviews.
# Note this rebinds X, which earlier held the sentiment-model text.
X = data2['Text']

In [None]:
# Binary target: low-helpfulness buckets (0-40%) -> 0, high-helpfulness buckets (60-100%) -> 1.
# Note this rebinds y_dict and y from the sentiment model.
y_dict = {'0-20%': 0, '20-40%': 0, '60-80%': 1, '80-100%': 1}
y = data2['%upvote'].map(y_dict)

In [None]:
# Number of samples per target class.
print(y.value_counts())

### The target class 'y' is highly skewed: positive upvotes far outnumber negative ones.
    

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectoriser with default settings (unlike the earlier ones, no stop-word list is passed).
tf=TfidfVectorizer()

In [None]:
# Learn the vocabulary / IDF weights from the text and build the TF-IDF matrix.
X_c=tf.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split
# 70/30 split. The target is heavily imbalanced, so the split is stratified to keep
# the class ratio the same in both parts, and seeded so results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_c, y, train_size=0.7, stratify=y, random_state=42
)

In [None]:
# Class distribution inside the held-out test split.
y_test.value_counts()

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Random over-sampler with default settings.
# NOTE(review): the name `os` would shadow the standard-library `os` module if that is ever imported here.
os =  RandomOverSampler()

In [None]:
# Balance the classes by oversampling the TRAINING split only. Resampling the full
# X_c / y would put every test row into the training data and inflate the final score.
# `fit_resample` is the current imbalanced-learn API; the old `fit_sample` alias was removed.
X_train_res, y_train_res = os.fit_resample(X_train, y_train)

In [None]:
# Shapes of the resampled feature matrix and target.
X_train_res.shape,y_train_res.shape

In [None]:
from collections import Counter

In [None]:
# Per-class sample counts before and after oversampling.
original_counts = Counter(y)
resampled_counts = Counter(y_train_res)
print(f'Original dataset shape {original_counts}')
print(f'Resampled dataset shape {resampled_counts}')

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# The grid searched below tries both 'l1' and 'l2' penalties. The default solver
# in current scikit-learn ('lbfgs') cannot fit 'l1', so every 'l1' candidate would
# fail and be scored NaN; 'liblinear' supports both penalties.
log_class = LogisticRegression(solver='liblinear')

In [None]:
# Hyper-parameter grid: C over five powers of ten (10^-2 ... 10^2), for both penalties.
grid = {
    'C': np.logspace(-2, 2, num=5),
    'penalty': ['l1', 'l2'],
}

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV

In [None]:
# 5-fold cross-validated grid search over `grid`, scored by macro-averaged F1,
# with n_jobs=-1 to run candidates in parallel.
# NOTE(review): the training data was randomly oversampled, so duplicated rows may land
# in both the fitting and validation folds — confirm the CV scores are not over-optimistic.
clf=GridSearchCV(estimator=log_class,param_grid=grid,cv=5,n_jobs=-1,scoring='f1_macro')
clf.fit(X_train_res,y_train_res)

In [None]:
# Evaluate the refit best estimator on the held-out split:
# first the confusion matrix, then the accuracy.
y_pred = clf.predict(X_test)
for metric in (confusion_matrix, accuracy_score):
    print(metric(y_test, y_pred))