In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [38]:
df_train = pd.read_csv("/kaggle/input/amazon-reviews/train.csv" , header=None)
df_test = pd.read_csv('/kaggle/input/amazon-reviews/test.csv' , header = None)

In [39]:
df_train.columns = ['polarity', 'title', 'text']
df_test.columns = ['polarity', 'title', 'text']

In [40]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   polarity  int64 
 1   title     object
 2   text      object
dtypes: int64(1), object(2)
memory usage: 82.4+ MB


In [41]:
df_train.head()

Unnamed: 0,polarity,title,text
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


In [42]:
# polarity column description
# 1 == Negative
# 2 == Positive
df_train['polarity'].unique()

array([2, 1])

In [43]:
df_train['polarity'].value_counts()

polarity
2    1800000
1    1800000
Name: count, dtype: int64

In [44]:
# LOWER CASE THE DATA OF COLUMN 'title'

df_train['title'] = df_train['title'].str.lower()

In [45]:
# LOWER CASE THE DATA OF COLUMN 'text'

df_train['text'] = df_train['text'].str.lower()

In [46]:
df_train['title'].sample(10)

247688     ***** junk!! do not waste your money on this o...
1461988                                              perfect
1945871                                          bad machine
611514                                  wyatt earp's revenge
2658457                forcing myself to finish this one....
3270026                  too much talking, not enough action
661831                                  outstanding voice...
474590                                           a good yarn
758248                paranomal activity versus the moon men
13131                                         very enjoyable
Name: title, dtype: object

In [47]:
print("shape of df_train: ",df_train.shape)
print("shape of df_test: ",df_test.shape)

shape of df_train:  (3600000, 3)
shape of df_test:  (400000, 3)


In [48]:
# nan values in df_train
df_train.isna().sum()

polarity      0
title       207
text          0
dtype: int64

In [49]:
#  nan values in df_test
df_test.isna().sum()

polarity     0
title       24
text         0
dtype: int64

In [50]:
# drop nan values 
df_train = df_train.dropna()
df_test = df_test.dropna()

In [None]:
#  droping title as i think it is not needed for now 

df_train = df_train.drop(['title'], axis=1)
df_test = df_test.drop(['title'], axis=1)

In [52]:
import string 
translator = str.maketrans('','',string.punctuation)
digit = str.maketrans('','',string.digits)

In [53]:
# removing punctuations from title
# df_train['title'] = df_train['title'].apply(lambda x: x.translate(translator))

In [None]:
# removing punctuations from text
df_train['text'] = df_train['text'].apply(lambda x: x.translate(translator))

In [99]:
# removing punctuations from text
df_test['text'] = df_test['text'].apply(lambda x: x.translate(translator))

In [55]:
df_train.sample(10)

Unnamed: 0,polarity,text
1060301,1,setting aside a gaping hole or two in the plot...
212035,1,i originally saw this movie on showtime and lo...
2102725,1,my friend had told me how this movie was so go...
883235,2,if you are a fan of the original thx 1138 i th...
1454751,1,this would be a cute novelty item that is neve...
2909118,1,i really wish i would have looked more closely...
979412,2,i am a beginner and this book has been my bibl...
802554,1,leonards greatest strength has always been his...
423295,2,i have a license plate cover so when i mounted...
283218,2,an interesting if long read about a fascinatin...


In [56]:
# removing the numbers from the "title" 
# df_train['title'] = df_train['title'].apply(lambda x: x.translate(digit))

In [None]:
# removing the numbers from the "text"
df_train['text'] = df_train['text'].apply(lambda x: x.translate(digit))

In [100]:
# removing the numbers from the "text"
df_test['text'] = df_test['text'].apply(lambda x: x.translate(digit))

In [58]:
df_train.sample(10)

Unnamed: 0,polarity,text
2861833,2,hi this shark is awsome it includes drones th...
1868246,1,this is very poorly made i wanted to get one f...
660654,1,this dvd was sent and the audio worked fine bu...
43452,1,i ordered the softgel form of this pill becaus...
2353169,1,ive gone through several other beard trimmers ...
129274,1,i bought this toy for my daughter and it did n...
376525,1,i purchased this because i am an exercise begi...
2838943,2,my little girl loves this movie she would watc...
2434874,2,a remarkable clear and concise description of ...
2580648,2,i am a huge firm fan and workout for minutes ...


In [None]:
# removing extra spaces just in case 
# df_train['title'] = df_train['title'].str.strip()
df_train['text'] = df_train['text'].str.strip()


In [101]:

df_test['text'] = df_test['text'].str.strip()

In [60]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords  


In [61]:
stopword = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [64]:
from tqdm import tqdm
tqdm.pandas()

In [65]:
# removing stop words and also applying the tokenizations

def tokenization_and_stopwords_removal(text):
    tokenized_text = word_tokenize(text)
    texts = [word for word in tokenized_text if word.lower() not in stopword ]
    stemmed_text = [stemmer.stem(word) for word in texts]
    

    return " ".join(stemmed_text)

df_train['text'] = df_train['text'].progress_apply(tokenization_and_stopwords_removal)

100%|██████████| 3599793/3599793 [55:17<00:00, 1085.24it/s]


In [102]:
df_test['text'] = df_test['text'].progress_apply(tokenization_and_stopwords_removal)

100%|██████████| 399976/399976 [06:07<00:00, 1088.19it/s]


In [66]:
# length of the text (new feature)

def len_of_text(text):
    return len(text)

df_train['text length'] = df_train['text'].apply(len_of_text)

In [103]:
df_test['text length'] = df_test['text'].apply(len_of_text)

In [74]:

df_train.sample(10)

Unnamed: 0,polarity,text,text length
574111,2,bought book daughter bulli school thought woul...,449
2055890,1,excit thoughtprovok lifechang everyth radiohea...,507
90372,1,pdf copi book friend got end thought miss last...,265
3566544,1,hand worst coffe maker ever own conveni caraf ...,396
787605,2,copi book librari long interest occult asid cl...,491
3409375,2,pretti good sound consid fascin solo work gem ...,124
1871635,2,great,5
3099981,2,final book sit read child help understand ill ...,238
2939304,2,walt kelli pogo noth short extrodinari perfect...,400
60848,2,best thing movi tri convey must like aboard th...,315


In [78]:
from sklearn.feature_extraction.text import CountVectorizer
vectorize = CountVectorizer(max_features=5000)
X = vectorize.fit_transform(df_train['text'])

In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer_tfid = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_tfid = vectorizer_tfid.fit_transform(df_train['text'])

In [85]:
print("df_train['polarity] ",  df_train['polarity'].value_counts())
print("")
print("df_test['polarity] ",  df_test['polarity'].value_counts())


df_train['polarity]  polarity
1    1799913
0    1799880
Name: count, dtype: int64

df_test['polarity]  polarity
2    199992
1    199984
Name: count, dtype: int64


In [96]:
print(f"df_test is null or not\n\n {df_test.isnull().sum()}")
print('')
print(f"df_train is null or not\n\n {df_train.isnull().sum()}")

df_test is null or not

 polarity    0
title       0
text        0
dtype: int64

df_train is null or not

 polarity       0
text           0
text length    0
dtype: int64


In [None]:
# now negative = 0, positive = 1 for simplicity

df_train['polarity'] = df_train['polarity'].map({2:1, 1:0})
df_test['polarity'] = df_test['polarity'].map({2:1,1:0})

In [104]:
df_train.to_csv('train.csv', index=False)
df_test.to_csv('test.csv', index=False)