In [10]:
import pandas as pd
import numpy as np
import re

In [2]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

In [3]:
%%time
df1 = pd.read_excel('Social_Listening_Excel_SQL_JAN17_AUG20.xlsx')
df2 = pd.read_excel('SOCIAL_LISTENING_EXCEL_SQL_SEP`20_MAY`21.xlsx')
df = pd.concat([df1[['Brand', 'Sentiment', 'Content']], df1[['Brand', 'Sentiment', 'Content']]], ignore_index=True)
df.shape

Wall time: 3min 22s


(999486, 3)

In [4]:
df.head()

Unnamed: 0,Brand,Sentiment,Content
0,Scorpio,Neutral,"16 hours ago - First introduced in 2007, the M..."
1,Scorpio,Neutral,1 day ago - CarWale - Used Mahindra Scorpio [2...
2,Scorpio,Neutral,20 hours ago - CarWale - Used Mahindra Scorpio...
3,Scorpio,Neutral,1 hour ago - The 2017 Mahindra Scorpio Getaway...
4,Scorpio,Neutral,19 hours ago - CarWale - Used Mahindra Scorpio...


In [5]:
def preprocess_df(df):
    df = df.copy()
    df = df[~df['Content'].isna()]
    df.drop_duplicates(subset=['Content'], inplace=True)
    return df

In [8]:
df[~df['Content'].isna()].shape

(996706, 3)

In [9]:
df[~df['Content'].isna()].Content.nunique()

302462

In [6]:
processed_df = preprocess_df(df)
processed_df.shape

(302462, 3)

In [31]:
def preprocess_text(text):
    # remove hyperlinks
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    # it will remove hashtags. We have to be careful here not to remove 
    # the whole hashtag because text of hashtags contains huge information. 
    # only removing the hash # sign from the word
    text = re.sub(r'#', '', text)
    # it will remove single numeric terms in the tweet. 
    text = re.sub(r'[0-9]', '', text)
    text = re.sub(r'day ago - |hours ago - |hour ago|CarWale', '', text)
    return text

In [32]:
%%time
processed_df['processed_content'] = processed_df['Content'].apply(lambda x: preprocess_text(str(x)))

Wall time: 2.48 s


In [34]:
processed_df.head()

Unnamed: 0,Brand,Sentiment,Content,processed
0,Scorpio,Neutral,"16 hours ago - First introduced in 2007, the M...","First introduced in , the Mahindra Getaway is..."
1,Scorpio,Neutral,1 day ago - CarWale - Used Mahindra Scorpio [2...,- Used Mahindra Scorpio [-] Ex for sale in H...
2,Scorpio,Neutral,20 hours ago - CarWale - Used Mahindra Scorpio...,- Used Mahindra Scorpio [-] Ex for sale in H...
3,Scorpio,Neutral,1 hour ago - The 2017 Mahindra Scorpio Getaway...,- The Mahindra Scorpio Getaway has been spo...
4,Scorpio,Neutral,19 hours ago - CarWale - Used Mahindra Scorpio...,- Used Mahindra Scorpio [-] VLX WD BS-III fo...


In [37]:
%%time
processed_df['vader_compound_score'] = processed_df['processed_content'].apply(lambda x: analyzer.polarity_scores(str(x))['compound'])

Wall time: 1min 26s


In [38]:
processed_df['vader_sentiment'] = processed_df['vader_compound_score'].apply(lambda x: 'Positive' if x > 0 else ('Neutral' if x == 0 else 'Negative'))

In [39]:
processed_df

Unnamed: 0,Brand,Sentiment,Content,processed,vader_compound_score,vader_sentiment
0,Scorpio,Neutral,"16 hours ago - First introduced in 2007, the M...","First introduced in , the Mahindra Getaway is...",0.3612,Positive
1,Scorpio,Neutral,1 day ago - CarWale - Used Mahindra Scorpio [2...,- Used Mahindra Scorpio [-] Ex for sale in H...,0.0772,Positive
2,Scorpio,Neutral,20 hours ago - CarWale - Used Mahindra Scorpio...,- Used Mahindra Scorpio [-] Ex for sale in H...,0.0772,Positive
3,Scorpio,Neutral,1 hour ago - The 2017 Mahindra Scorpio Getaway...,- The Mahindra Scorpio Getaway has been spo...,0.0000,Neutral
4,Scorpio,Neutral,19 hours ago - CarWale - Used Mahindra Scorpio...,- Used Mahindra Scorpio [-] VLX WD BS-III fo...,0.6633,Positive
...,...,...,...,...,...,...
499738,Marazzo,Neutral,@anandmahindra https://t.co/dngwphbRES. The p...,@anandmahindra,0.0000,Neutral
499739,Marazzo,Neutral,@anandmahindra सर आप यह गाड़ी महान है अपने देश...,@anandmahindra सर आप यह गाड़ी महान है अपने देश...,0.3182,Positive
499740,Marazzo,Neutral,Mahindra Marazzo review. The shark inspired MU...,Mahindra Marazzo review. The shark inspired MU...,0.7345,Positive
499741,Marazzo,Neutral,Mahindra cars price list in India *All prices ...,Mahindra cars price list in India *All prices ...,0.8000,Positive


In [43]:
processed_df['Content'][0]

'16 hours ago - First introduced in 2007, the Mahindra Getaway is due for an upgrade. The Scorpio-based pick-up is sold as Pik-Up in a few international markets like Australia#160;...'

In [None]:
# processed_df.to_excel('result.xlsx', index=False)