In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords


reviews_df=pd.read_csv('Amazon_Reviews.csv')
reviews_df.head(15)
reviews_df['Label']=reviews_df['Label'].map({'__label__2 ':1,'__label__1 ':0})

y=reviews_df['Label']
reviews_df.drop(columns='Label',axis=1,inplace=True)
X_train, X_test, y_train, y_test= train_test_split(reviews_df, y, test_size=0.2, random_state=42)


reviews_df

Unnamed: 0,Review
0,Stuning even for the non-gamer: This sound tr...
1,The best soundtrack ever to anything.: I'm re...
2,Amazing!: This soundtrack is my favorite musi...
3,Excellent Soundtrack: I truly like this sound...
4,"Remember, Pull Your Jaw Off The Floor After H..."
...,...
194,A Book That Is Worth a Second Look: This book...
195,Best game ever: This games makes even amazing...
196,Guitar in Absentia: With all due respect to a...
197,Stiff and Smells like drying paint: You get w...


In [13]:
X_test.shape

(40, 2)

In [2]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer


tokenizer=RegexpTokenizer(r'\w+')
lemmatizer= WordNetLemmatizer()
stemmer=PorterStemmer()

def preprocessing(review):
    
    final_tokens=' '
    tokens=tokenizer.tokenize(review)
    pure_tokens=[token.lower() for token in tokens if token.lower() not in stopwords.words('english')]
    lemmas_tokens=[lemmatizer.lemmatize(pure_token) for pure_token in pure_tokens]
    
    final_tokens=final_tokens.join(lemmas_tokens)
    return final_tokens
X_train['Cleaned_text']=X_train['Review'].apply(preprocessing)
X_test['Cleaned_text']=X_test['Review'].apply(preprocessing)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer=TfidfVectorizer(stop_words='english',use_idf=True)
X_train_TfIdf=vectorizer.fit_transform(X_train['Cleaned_text'])
X_test_TfIdf=vectorizer.transform(X_test['Cleaned_text'])


In [4]:
pd.DataFrame(X_train_TfIdf.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2379,2380,2381,2382,2383,2384,2385,2386,2387,2388
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.705272,0.0,0.0,0.0,0.0,0.0,0.0,0.0
155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.metrics import confusion_matrix,roc_curve,roc_auc_score
import matplotlib.pyplot as plt

clf=MultinomialNB().fit(X_train_TfIdf,y_train)

y_pred=clf.predict(X_test_TfIdf.toarray())

confusion_matrix(y_test,y_pred)

y_proba_pred=clf.predict_proba(X_test_TfIdf.toarray())[::,1]

fpr,tpr,thresholds=roc_curve(y_test,y_proba_pred)

plt.plot(fpr,tpr)

plt.xlabel('FPR') 
plt.ylabel('TPR')
plt.title('auc-roc curve')
plt.legend(loc=6)
plt.show()
roc_s=roc_auc_score(y_test,y_proba_pred)
roc_s


No handles with labels found to put in legend.


<Figure size 640x480 with 1 Axes>

0.8533333333333333

In [12]:
arun_df=pd.DataFrame({'y_actual':y_test,'y_predicted':clf.predict(X_test_TfIdf.toarray())})
arun_df

Unnamed: 0,y_actual,y_predicted
82,1,1
15,0,0
111,1,1
177,1,1
76,0,0
163,1,1
68,1,1
67,1,1
120,1,1
173,1,1
