In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv('training.csv')

In [3]:
df.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [4]:
def preprocess_data(df):
    # Remove package name as it's not relevant
    df = df.drop('package_name', axis=1)
    
    # Convert text to lowercase
    df['review'] = df['review'].str.strip().str.lower()
    return df

In [5]:
df = preprocess_data(df)

In [6]:
# Split into training and testing data
x = df['review']
y = df['polarity']
x, x_test, y, y_test = train_test_split(x,y, stratify=y, test_size=0.25, random_state=42)

In [7]:
# Vectorize text reviews to numbers
vec = CountVectorizer(stop_words='english')
x = vec.fit_transform(x).toarray()
x_test = vec.transform(x_test).toarray()

In [8]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(x, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [9]:
model.score(x_test, y_test)

0.8565022421524664

In [10]:
model.predict(vec.transform(['I love this app !!!']))

array([1])

In [11]:
# Save model
joblib.dump(model, 'model.pkl')

['model.pkl']