In [1]:
import numpy as np 
import pandas as pd 
import string 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
import nltk
nltk.download('vader_lexicon')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/abhilash/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Read the review export into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv('chrome_reviews.csv')
# Show the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,ID,Review URL,Text,Star,Thumbs Up,User Name,Developer Reply,Version,Review Date,App ID
0,3886,https://play.google.com/store/apps/details?id=...,This is very helpfull aap.,5,0,INDIAN Knowledge,,83.0.4103.106,2020-12-19,com.android.chrome
1,3887,https://play.google.com/store/apps/details?id=...,Good,3,2,Ijeoma Happiness,,85.0.4183.127,2020-12-19,com.android.chrome
2,3888,https://play.google.com/store/apps/details?id=...,Not able to update. Neither able to uninstall.,1,0,Priti D BtCFs-29,,85.0.4183.127,2020-12-19,com.android.chrome
3,3889,https://play.google.com/store/apps/details?id=...,Nice app,4,0,Ajeet Raja,,77.0.3865.116,2020-12-19,com.android.chrome
4,3890,https://play.google.com/store/apps/details?id=...,Many unwanted ads,1,0,Rams Mp,,87.0.4280.66,2020-12-19,com.android.chrome


In [3]:
# Only the review body and its star rating are used from here on; every other
# column of the export is dropped.
df = df.loc[:, ["Text", "Star"]]
df.head()

Unnamed: 0,Text,Star
0,This is very helpfull aap.,5
1,Good,3
2,Not able to update. Neither able to uninstall.,1
3,Nice app,4
4,Many unwanted ads,1


In [4]:
# Discard every row that has a missing value in any remaining column ...
df = df.dropna()
# ... and renumber the surviving rows 0..n-1 without keeping the old index.
df = df.reset_index(drop=True)

In [5]:
# Domain-specific stop words that clean_text() strips from every review.
# Each word is listed once ('apps' used to appear twice); 'aap' is kept because
# it shows up in the data as a misspelling of "app".
new_words = ['app','chrome','google','apps','aap','update','updated','browser']

In [6]:
import emoji 
import re 
def removeEmoji(text):
    """Return ``text`` with every emoji removed.

    ``emoji.get_emoji_regexp()`` was deprecated and later removed from the
    ``emoji`` package, so the supported ``emoji.replace_emoji`` API is used
    when the installed version provides it; older installs fall back to the
    original regexp-based removal.

    Args:
        text: The string to strip emoji from.

    Returns:
        The input string without emoji characters.
    """
    if hasattr(emoji, "replace_emoji"):
        return emoji.replace_emoji(text, replace='')
    # Legacy path for emoji versions that predate replace_emoji.
    return emoji.get_emoji_regexp().sub(r'',text)

def clean_text(text):
    """Normalise a raw review into a space-separated string of lemmatised tokens.

    Steps, in order: strip emoji, delete punctuation and digit characters,
    lower-case, tokenise with NLTK's ``word_tokenize``, drop the stop words
    listed in ``new_words``, and lemmatise the remaining tokens with WordNet.

    Args:
        text: Raw review text.

    Returns:
        The cleaned text as one string (empty if no token survives).
    """
    text = removeEmoji(text)
    # Iterating a string yields characters, so this filter works per character.
    # Punctuation is deleted rather than replaced by a space.
    kept_chars = [
        ch.lower()
        for ch in text
        if ch not in string.punctuation and not ch.isdigit()
    ]
    tokens = word_tokenize("".join(kept_chars))
    # One lemmatizer per call instead of one per token (loop-invariant work).
    lemmatizer = WordNetLemmatizer()
    # Stop words are matched against the raw token, before lemmatisation.
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in new_words
    )

In [7]:
# Store the normalised form of every review next to the raw text.
df['TextClean'] = df['Text'].apply(clean_text)
df.head()

Unnamed: 0,Text,Star,TextClean
0,This is very helpfull aap.,5,this is very helpfull
1,Good,3,good
2,Not able to update. Neither able to uninstall.,1,not able to neither able to uninstall
3,Nice app,4,nice
4,Many unwanted ads,1,many unwanted ad


In [8]:
# Score every cleaned review with VADER. polarity_scores() returns one dict per
# text, and each dict key becomes a column of `sentiment_scores`.
sid = SentimentIntensityAnalyzer()
sentiment_scores = pd.DataFrame(
    [sid.polarity_scores(text) for text in df["TextClean"]],
    index=df.index,
)
# Drop score columns left over from a previous run of this cell before
# attaching the fresh ones; otherwise a re-run duplicates the columns and
# `df.pos` stops being a single Series.
df = pd.concat(
    [df.drop(columns=sentiment_scores.columns, errors="ignore"), sentiment_scores],
    axis=1,
)
df.head()

Unnamed: 0,Text,Star,TextClean,neg,neu,pos,compound
0,This is very helpfull aap.,5,this is very helpfull,0.0,1.0,0.0,0.0
1,Good,3,good,0.0,0.0,1.0,0.4404
2,Not able to update. Neither able to uninstall.,1,not able to neither able to uninstall,0.0,1.0,0.0,0.0
3,Nice app,4,nice,0.0,0.0,1.0,0.4215
4,Many unwanted ads,1,many unwanted ad,0.487,0.513,0.0,-0.2263


In [9]:
# A review is labelled True when VADER's positive score exceeds this threshold.
# The label depends on the text only; the star rating is not consulted here.
POSITIVE_SCORE_THRESHOLD = 0.4

# Vectorised comparison instead of a Python loop; still exposed as a plain
# list of bools under the original name.
wrongReview = (df['pos'] > POSITIVE_SCORE_THRESHOLD).tolist()

df['wrongReview'] = wrongReview
df.head()

Unnamed: 0,Text,Star,TextClean,neg,neu,pos,compound,wrongReview
0,This is very helpfull aap.,5,this is very helpfull,0.0,1.0,0.0,0.0,False
1,Good,3,good,0.0,0.0,1.0,0.4404,True
2,Not able to update. Neither able to uninstall.,1,not able to neither able to uninstall,0.0,1.0,0.0,0.0,False
3,Nice app,4,nice,0.0,0.0,1.0,0.4215,True
4,Many unwanted ads,1,many unwanted ad,0.487,0.513,0.0,-0.2263,False


In [10]:
# Class balance of the generated label.
df['wrongReview'].value_counts()

False    3917
True     3286
Name: wrongReview, dtype: int64

In [11]:
# Bag-of-words features (tokenised with NLTK's word_tokenize) feeding a random
# forest. The seed is fixed so that re-running the notebook refits the same
# model and therefore writes the same pickle.
cvt = CountVectorizer(tokenizer=word_tokenize)
clf = RandomForestClassifier(random_state=42)
pipe = make_pipeline(cvt,clf)
# NOTE(review): the model is fitted on the full frame; no hold-out set is kept
# for evaluation in this notebook.
pipe.fit(df['TextClean'],df['wrongReview'])

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(tokenizer=<function word_tokenize at 0x7f8166413950>)),
                ('randomforestclassifier', RandomForestClassifier())])

In [12]:
import joblib
# Destination of the serialised pipeline, relative to the working directory.
file = 'model.pkl'

# Persist the fitted pipeline; the cell displays the list of written file names.
joblib.dump(pipe,file)

['model.pkl']

In [13]:
# Reload the pipeline from disk under the name used by the prediction helper.
# NOTE(review): joblib files are pickle-based and can execute arbitrary code on
# load, so only load model files produced by a trusted source.
model = joblib.load('model.pkl')

In [14]:
def predict_if_rating_is_wrong(Review,Rating):
    """Check whether a low-star review is classified as positive by the model.

    Only ratings of 3 or lower are examined; for any other rating no
    prediction is made. The raw prediction array is still printed, as before,
    and the result is now also returned so callers can use it.

    Args:
        Review: Raw review text; it is normalised with ``clean_text`` first.
        Rating: Star rating that accompanied the review.

    Returns:
        ``True``/``False`` (the model's prediction for the cleaned text) when
        ``Rating <= 3``; ``None`` when no prediction is made.
    """
    # Written as a negated `<=` so values that fail the comparison (e.g. NaN)
    # are skipped exactly as they were before.
    if not (Rating <= 3):
        return None
    text = clean_text(Review)
    pred = model.predict([text])
    print(pred)  # kept so existing usage still shows the raw prediction array
    return bool(pred[0])

In [15]:
# Sample sentence reused by the prediction demo in the following cell.
x = 'This is a amazing app'
# NOTE(review): this scores the raw sentence, whereas the training labels were
# derived from VADER scores of the cleaned text (`TextClean`).
sid.polarity_scores(x)

{'neg': 0.0, 'neu': 0.441, 'pos': 0.559, 'compound': 0.5859}

In [16]:
# A rating of 3 passes the function's `Rating<=3` check, so the model is consulted.
predict_if_rating_is_wrong(x,3)

[ True]
