In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Load the raw reviews from 'reviews.csv' (relative path, resolved against the working directory).
df = pd.read_csv('reviews.csv')

In [4]:
# Preview the first rows to confirm the columns that were loaded.
df.head()

Unnamed: 0,ProductID,ReviewID,Review
0,HT-1000,100000000,Three start. I have some issues like some keys...
1,HT-1001,100000001,Heating and not wake up. Check twice when buy ...
2,HT-1002,100000002,2nd hand laptops!!! The laptop I received alre...
3,HT-1003,100000003,Review: BEST LAPTOP IN A PRICE SEGMENT OF Rs 5...
4,HT-1010,100000004,Review: second hand product. don't buy this. I...


### Step 1:
Converting raw text into a TF-IDF matrix.

In [5]:
# Build a TF-IDF vectorizer with its default settings, then learn the vocabulary from
# the 'Review' column and transform those same reviews in a single call.
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(df['Review'])


In [6]:
# Convert to a dense array purely for inspection. print() is used for the matrix because
# only the cell's last expression (the shape) is displayed automatically.
print(x.toarray())
x.shape

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.25836133 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


(85, 537)

In [7]:
# Wrap the dense TF-IDF matrix in a DataFrame whose column labels are the
# vectorizer's feature names, then preview the first rows.
df_data = pd.DataFrame(
    data=x.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
df_data.head()

Unnamed: 0,10,2017,2nd,50k,55,55k,60k,90,able,abnormal,...,works,world,worst,worth,worthy,would,wrking,yes,you,your
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.258361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.420583,0.0,0.420583,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Step 2:
Computing sentiment score for each review and creating a feature from the score.

In [8]:
from textblob import TextBlob

In [9]:
# Score each review with TextBlob and store the polarity as a new numeric column.
df['sentiment'] = df['Review'].map(
    lambda text: TextBlob(text).sentiment.polarity
)

In [10]:
# Preview the frame again to check the newly added 'sentiment' column.
df.head()

Unnamed: 0,ProductID,ReviewID,Review,sentiment
0,HT-1000,100000000,Three start. I have some issues like some keys...,-0.025
1,HT-1001,100000001,Heating and not wake up. Check twice when buy ...,0.0
2,HT-1002,100000002,2nd hand laptops!!! The laptop I received alre...,-0.042614
3,HT-1003,100000003,Review: BEST LAPTOP IN A PRICE SEGMENT OF Rs 5...,1.0
4,HT-1010,100000004,Review: second hand product. don't buy this. I...,0.0


In [11]:
# Show the first row recorded for each distinct polarity value, ordered from the most
# negative score to the most positive (groupby sorts by key by default).
# The cell ends with the bare expression rather than print() so the notebook renders
# the frame as a table instead of wrapped plain text.
# NOTE: the grouping key is the raw float, so scores that differ only by floating-point
# noise land in separate groups even though they are displayed identically.
df.groupby('sentiment').first()

          ProductID   ReviewID  \
sentiment                        
-1.0000     HT-1007  100000027   
-0.9550     HT-1020  100000005   
-0.7000     HT-1030  100000016   
-0.7000     HT-1010  100000014   
-0.6350     HT-1001  100000021   
...             ...        ...   
 0.8275     HT-1010  100000024   
 0.8500     HT-1050  100000044   
 0.8700     HT-1007  100000042   
 0.9000     HT-1000  100000020   
 1.0000     HT-1003  100000003   

                                                      Review  
sentiment                                                     
-1.0000    Review: Worst laptop ever not for student use ...  
-0.9550    Review: very bad. My product is not working wh...  
-0.7000    Review: It’s been just 2 months and the device...  
-0.7000    Review:  have had extremely bad experiences wi...  
-0.6350    Review: very poor. disappointed. Defective pro...  
...                                                      ...  
 0.8275    Review: Very good and light weight laptop.

In [52]:
def classify_sentiment(polarity):
    """Map a polarity score to a coarse sentiment label.

    Bands (identical to the original if/elif chain):
        [-1.0, -0.5]  -> 'Terrible'
        (-0.5,  0.0)  -> 'Bad'
        [ 0.0,  0.5]  -> 'Neutral'
        ( 0.5,  0.8)  -> 'Good'
        [ 0.8,  1.0]  -> 'Excellent'

    Parameters
    ----------
    polarity : float
        Sentiment polarity score.

    Returns
    -------
    str or None
        The label for the band containing ``polarity``; ``None`` when the score
        is NaN or falls outside [-1, 1], so the result always has one entry per
        input row instead of silently dropping a row.
    """
    if -1 <= polarity <= -0.5:
        return 'Terrible'
    if -0.5 < polarity < 0:
        return 'Bad'
    if 0 <= polarity <= 0.5:
        return 'Neutral'
    if 0.5 < polarity < 0.8:
        return 'Good'
    if 0.8 <= polarity <= 1:
        return 'Excellent'
    # Comparisons with NaN are all False, so missing scores end up here too.
    return None


# Element-wise mapping keeps the result aligned with df's index, so this works for any
# index (not just 0..n-1) and can never produce a list of the wrong length.
df['sentiment_result'] = df['sentiment'].map(classify_sentiment)

In [53]:
# Display the frame to check the new 'sentiment_result' label next to each numeric score.
df

Unnamed: 0,ProductID,ReviewID,Review,sentiment,sentiment_result
0,HT-1000,100000000,Three start. I have some issues like some keys...,-0.025000,Bad
1,HT-1001,100000001,Heating and not wake up. Check twice when buy ...,0.000000,Neutral
2,HT-1002,100000002,2nd hand laptops!!! The laptop I received alre...,-0.042614,Bad
3,HT-1003,100000003,Review: BEST LAPTOP IN A PRICE SEGMENT OF Rs 5...,1.000000,Excellent
4,HT-1010,100000004,Review: second hand product. don't buy this. I...,0.000000,Neutral
...,...,...,...,...,...
80,HT-1020,100000035,Review: Good product. But we are not getting o...,0.700000,Good
81,HT-1030,100000036,Review: Finally a MacBook possession. Worth ev...,0.150000,Neutral
82,HT-1007,100000037,Review: Really good is and it is very smooth e...,0.673333,Good
83,HT-1040,100000038,Review: Amazing product by Apple and great del...,0.700000,Good


### Step 3:
Creating features based on the presence of specific n-grams that are hypothesized to be strong indicators of positive or negative sentiment.