In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 0) #To display entire text content of a column
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'
from sklearn.model_selection import train_test_split
import os

In [5]:
# Switch to the folder that holds the capstone data so that the relative
# 'processed_csv/...' paths used in this notebook resolve.
# The default path is a raw string: in a normal string literal '\c' is an
# invalid escape sequence, which Python only tolerates with a warning.
# Set the CAPSTONE_DATA_DIR environment variable to point at a different
# location without editing the notebook.
os.chdir(os.environ.get('CAPSTONE_DATA_DIR', r'D:\capstone_data'))

In [8]:
# Load the four pre-processed datasets. The names on the left are bound in the
# same order as the file paths listed on the right.
(sentiment_140_df,
 sentiment_140_neutlabels,
 consumer_complaints_df,
 amazonreviews_df) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'processed_csv/sentiment140_processed.csv',
        'processed_csv/sentiment140_neutlabels.csv',
        'processed_csv/consumer_complaints_processed.csv',
        'processed_csv/amazonreviews_extract.csv',
    )
)

  interactivity=interactivity, compiler=compiler, result=result)


In [13]:
#Verifying duplicate rows within each df
# For every dataset, print how many rows are exact duplicates of an earlier row
# and what share of the dataset that is (rounded to a whole percent).
frames_to_check = {
    'sentiment_140_df': sentiment_140_df,
    'sentiment_140_neutlabels': sentiment_140_neutlabels,
    'consumer_complaints_df': consumer_complaints_df,
    'amazonreviews_df': amazonreviews_df,
}
for frame_name, frame in frames_to_check.items():
    # Compute the duplicate mask once and reuse it for both the count and the share.
    is_duplicate = frame.duplicated()
    # ':.1f' removes the float noise that round(x, 2) * 100 can produce
    # (e.g. 0.29 * 100 == 28.999999999999996).
    print (f"Number of Duplicate rows in {frame_name} is {is_duplicate.sum()} i.e. {round(is_duplicate.mean(),2)*100:.1f}%\n")

Number of Duplicate rows in sentiment_140_df is 16309 i.e. 1.0%

Number of Duplicate rows in sentiment_140_neutlabels is 0 i.e. 0.0%

Number of Duplicate rows in consumer_complaints_df is 4653 i.e. 2.0%

Number of Duplicate rows in amazonreviews_df is 480721 i.e. 43.0%



Except for `sentiment_140_neutlabels`, all datasets contain duplicated rows. The percentage is low (less than 3%) for Sentiment140 and consumer complaints, but the Amazon reviews dataset has 43% duplicates. Since review text cannot be imputed and we in any case have more than 2 million data points for our model, we will simply delete all duplicated rows.

In [15]:
# Remove exact duplicate rows from the three datasets that reported duplicates.
# Each frame is modified in place, so the existing variable names keep
# referring to the de-duplicated data.
for frame_with_dups in (sentiment_140_df, consumer_complaints_df, amazonreviews_df):
    frame_with_dups.drop_duplicates(inplace = True)

In [16]:
#Verify
# Re-run the duplicate report; every dataset should now show 0 duplicate rows.
frames_to_verify = {
    'sentiment_140_df': sentiment_140_df,
    'sentiment_140_neutlabels': sentiment_140_neutlabels,
    'consumer_complaints_df': consumer_complaints_df,
    'amazonreviews_df': amazonreviews_df,
}
for frame_name, frame in frames_to_verify.items():
    # Compute the duplicate mask once and reuse it for both the count and the share.
    is_duplicate = frame.duplicated()
    # ':.1f' removes the float noise that round(x, 2) * 100 can produce
    # (e.g. 0.29 * 100 == 28.999999999999996).
    print (f"Number of Duplicate rows in {frame_name} is {is_duplicate.sum()} i.e. {round(is_duplicate.mean(),2)*100:.1f}%\n")

Number of Duplicate rows in sentiment_140_df is 0 i.e. 0.0%

Number of Duplicate rows in sentiment_140_neutlabels is 0 i.e. 0.0%

Number of Duplicate rows in consumer_complaints_df is 0 i.e. 0.0%

Number of Duplicate rows in amazonreviews_df is 0 i.e. 0.0%



`amazonreviews_df` still has a rating column rather than a label column. All ratings below 3 will be labelled 1 (grievances), and ratings from 3 through 5 will be labelled 0 (non-grievances).

In [21]:
# Creating labels for amazonreviews_df, first step:
# tally how many reviews carry each distinct value of the 'Overall' rating column
rating_counts = amazonreviews_df['Overall'].value_counts()
rating_counts

5.0     244493
4.0     92245 
5.0     91604 
1.0     47403 
4.0     35060 
3.0     34820 
1.0     32773 
2.0     26018 
3.0     17512 
2.0     13876 
None    1     
Name: Overall, dtype: int64

The same rating values appear as separate entries, i.e. some of the ratings are probably being read as text rather than as numbers. This column has to be converted to numeric.

In [22]:
#Dropping None value
# The rating counts show 'None' as a counted value, which means it is stored as
# the literal string 'None' rather than as a missing value; dropna() alone
# would leave that row in place and the later float conversion would fail.
# Map the placeholder to NaN first so the row is removed with the other
# incomplete rows.
amazonreviews_df['Overall'] = amazonreviews_df['Overall'].replace('None', np.nan)
amazonreviews_df.dropna(axis = 0, inplace = True)

In [28]:
#Convert to float and verify
# pd.to_numeric parses real floats as well as numeric strings such as '5.0'.
# errors='coerce' turns any value that cannot be parsed (e.g. a literal 'None')
# into NaN instead of raising, as a plain astype('float') would.
amazonreviews_df['Overall'] = pd.to_numeric(amazonreviews_df['Overall'], errors='coerce').astype('float')
# A row without a usable rating cannot be labelled, so drop it.
amazonreviews_df.dropna(subset = ['Overall'], inplace = True)
amazonreviews_df['Overall'].value_counts()

5.0    336095
4.0    127303
1.0    80174 
3.0    52330 
2.0    39892 
Name: Overall, dtype: int64

In [35]:
# Binary target: 1 for ratings of 2.0 or lower, 0 for every rating above 2.0
low_rating_mask = amazonreviews_df['Overall'].le(2.0)
amazonreviews_df['label'] = np.where(low_rating_mask, 1, 0)
# Show the last row as a quick look at the new column
amazonreviews_df.tail(1)

Unnamed: 0,Content,Overall,label
1116493,"After looking over different brands of IP camera and finally found this, it looks great and the spec of is better than many other IP camera at the same price.follow up:3/29/2014: after receiving the camera, I open up the box and the Camera looks nice.I set up the camera right out of box,the whole process was very easy, just follow the instruction that come with the camera. it took me a about 10 minutes to set every things up plus download the app on App Store. I control the camera with my Ipad 2 and it works great in the bright and dark. from this point I did not see any issue with this camera, but I will keep update my review about this product in future.",5.0,0


In [37]:
# Check the label counts against the per-rating counts:
#   0 -> 336095 + 127303 + 52330 = 515,728
#   1 -> 120,066
label_counts = amazonreviews_df['label'].value_counts()
label_counts
#Counts match

0    515728
1    120066
Name: label, dtype: int64

In [38]:
# The 'Overall' rating column is removed from the frame in place
amazonreviews_df.drop('Overall', axis = 'columns', inplace = True)

In [40]:
# Save the labelled Amazon reviews to disk; index = False keeps the row index out of the file
amazonreviews_df.to_csv(
    path_or_buf = 'processed_csv/amazonreviews_with_label.csv',
    index = False,
)

In [43]:
# Display the last five rows of the labelled frame
amazonreviews_df.tail(5)

Unnamed: 0,Content,label
1116489,"The pen was not sent with accessories, such as the manual, and the connector for charging. Additional image of the video camera is not very good",1
1116490,"I just received this item and discovered it had no usb cable, meaning I can't charge it, and no 8gb memory card, which renders it useless even if I could charge it. So my actual review is zero stars, because this is what the advertising says, ""Spy Pen Comes w/ Everything You NEED In An Awesome Black Drawstring Pouch!""",1
1116491,"It looks grt, and could not wait for using it. I want to use it everyday, I think this is high lvl technology.",0
1116492,"Based on the specs and description, this camera should be very easy to set up, and valuable to use for monitoring. I wanna try the motion detection alarm with photos I receive of activity it detects. You can't beat it for the price anywhere! Video quality looks excellent.",0
1116493,"After looking over different brands of IP camera and finally found this, it looks great and the spec of is better than many other IP camera at the same price.follow up:3/29/2014: after receiving the camera, I open up the box and the Camera looks nice.I set up the camera right out of box,the whole process was very easy, just follow the instruction that come with the camera. it took me a about 10 minutes to set every things up plus download the app on App Store. I control the camera with my Ipad 2 and it works great in the bright and dark. from this point I did not see any issue with this camera, but I will keep update my review about this product in future.",0


# Create master df
Total observations = 50,000
<br>0 = 25,000, 15,000 from sentiment 140, 10,000 from amazon reviews
<br>1 = 25,000, 15,000 from sentiment 140, 6,000 from consumer complaints, 9,000 from amazon reviews (**TODO**: these three sources add up to 30,000, not 25,000 — confirm the intended split)
<br> all observations from sentiment 140_neutlabels

In [None]:
Extract 