In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score

In [2]:
from sentiment_explorerVersion7 import *

In [3]:
def pdColumn_to_list_converter(df):
    """Flatten the values of ``df`` into a single Python list, row by row.

    ``df.values.tolist()`` yields one inner list per row; the inner lists are
    concatenated in order, so a one-column frame becomes a plain list of its
    cell values.
    """
    flattened = []
    for row_values in df.values.tolist():
        # each row contributes its cells in column order
        flattened.extend(row_values)
    return flattened

In [4]:
#df = pd.read_csv('test-readynew.csv')  #import reviews
df = pd.read_csv('sg-transport-1115-clean.csv')  #import reviews
df.head()

Unnamed: 0,Sentiment,Sentiment Num,Multi,Text
0,Neutral,0,0,Effective but too-tepid biopic
1,Positive,1,1,If you sometimes like to go to ...
2,Positive,1,2,"Emerges as something rare , an i..."
3,Neutral,0,0,The film provides some great insigh...
4,Positive,1,2,Offers that rare combination of en...


In [5]:
df['Text-original'] = df['Text']

### Removing Punctuation From Data

In [6]:
import re

def newtext(text):
    """Normalise one raw text record into lower-case text without punctuation.

    Steps, in order: trim and collapse whitespace, map the ``:)`` / ``:(``
    emoticons to the words "happy" / "sad", delete e-mail addresses,
    @mentions, ``&...`` entities, #hashtags, double-quoted passages and URLs,
    rewrite "as well as" / "as well", strip ``, . ; @ # ? ! & $`` together
    with the spaces that follow them, drop ``:`` and ``'``, and lower-case.

    Parameters
    ----------
    text : str
        Raw text of one record.

    Returns
    -------
    str
        The cleaned, lower-cased text. A single leading or trailing space can
        remain where a token was deleted at either end of the text.
    """
    text = text.strip()
    text = text.replace(":)", "happy")
    text = text.replace(":(", "sad")
    # Collapse every whitespace run (including \n, \r, \t) to one space.
    text = re.sub(r'\s+', ' ', text)
    # E-mail addresses must go before the @mention rule: otherwise that rule
    # strips only "@domain" and leaves the local part ("john") in the text.
    text = re.sub(r'\S*@\S*\s?', '', text)   # delete email address
    # The e-mail pattern already removes every token containing "@"; the
    # mention rule is kept so mentions stay covered if that pattern is tightened.
    text = re.sub(r'@[^\s]+', '', text)      # delete the username
    text = re.sub(r'&[^\s]+', '', text)
    text = re.sub(r'#[^\s]+', '', text)
    text = re.sub(r'".*?"', '', text)        # delete anything in quotation marks
    text = re.sub(r'http[s]?://\S+', '', text)  # delete urls
    text = text.replace("as well as", "and")
    text = text.replace("as well", "also")

    # NOTE(review): punctuation sitting directly between two words glues them
    # together ("a,b" -> "ab") because the match is replaced by the empty string.
    text = re.sub(r"""
               [,.;@#?!&$]+  # Accept one or more copies of punctuation
               \ *           # plus zero or more copies of a space,
               """,
               "",          # and replace it with no space
               text, flags=re.VERBOSE)
    text = re.sub(' +', ' ', text)
    text = re.sub("[:']", '', text)
    return text.lower()



In [7]:
def newtext_fullstop(text):
    """Normalise one raw text record but keep ``.``, ``?`` and ``!`` as tokens.

    Same cleaning as ``newtext`` except that sentence-ending punctuation is
    not stripped: each ``.``, ``?`` and ``!`` is kept and separated from the
    preceding word by a space. Only ``, ; @ # & $`` (plus the spaces that
    follow them), ``:`` and ``'`` are removed.

    Parameters
    ----------
    text : str
        Raw text of one record.

    Returns
    -------
    str
        The cleaned, lower-cased text with ``.``/``?``/``!`` preserved as
        space-separated tokens.
    """
    text = text.strip()
    text = text.replace(":)", "happy")
    text = text.replace(":(", "sad")
    # Collapse every whitespace run (including \n, \r, \t) to one space.
    text = re.sub(r'\s+', ' ', text)
    # E-mail addresses must go before the @mention rule: otherwise that rule
    # strips only "@domain" and leaves the local part ("john") in the text.
    text = re.sub(r'\S*@\S*\s?', '', text)   # delete email address
    # The e-mail pattern already removes every token containing "@"; the
    # mention rule is kept so mentions stay covered if that pattern is tightened.
    text = re.sub(r'@[^\s]+', '', text)      # delete the username
    text = re.sub(r'&[^\s]+', '', text)
    text = re.sub(r'#[^\s]+', '', text)
    text = re.sub(r'".*?"', '', text)        # delete anything in quotation marks
    text = re.sub(r'http[s]?://\S+', '', text)  # delete urls
    text = text.replace("as well as", "and")
    text = text.replace("as well", "also")

    text = re.sub(r"""
               [,;@#&$]+  # Accept one or more copies of punctuation
               \ *           # plus zero or more copies of a space,
               """,
               "",          # and replace it with no space
               text, flags=re.VERBOSE)
    text = text.replace('.', ' .')  # specially added to maintain fullstop
    text = text.replace('?', ' ?')  # specially added to maintain question mark
    text = text.replace('!', ' !')  # specially added to maintain exclamation mark
    text = re.sub(' +', ' ', text)
    text = re.sub("[:']", '', text)
    return text.lower()

In [8]:
df["Text_with_fullstop"] = df['Text'].apply(lambda text: newtext_fullstop(text))

In [9]:
df["Text"] = df['Text'].apply(lambda text: newtext(text))

In [10]:
df.drop(df.columns[0], axis=1)

Unnamed: 0,Sentiment Num,Multi,Text,Text-original,Text_with_fullstop
0,0,0,effective but too-tepid biopic,Effective but too-tepid biopic,effective but too-tepid biopic
1,1,1,if you sometimes like to go to the movies to h...,If you sometimes like to go to ...,if you sometimes like to go to the movies to h...
2,1,2,emerges as something rare an issue movie that ...,"Emerges as something rare , an i...",emerges as something rare an issue movie that ...
3,0,0,the film provides some great insight into the ...,The film provides some great insigh...,the film provides some great insight into the ...
4,1,2,offers that rare combination of entertainment ...,Offers that rare combination of en...,offers that rare combination of entertainment ...
...,...,...,...,...,...
2205,1,1,an imaginative comedy\/thriller,An imaginative comedy\/thriller .,an imaginative comedy\/thriller .
2206,1,2,-lrb- a -rrb- rare beautiful film,"-LRB- A -RRB- rare , beautiful fil...",-lrb- a -rrb- rare beautiful film .
2207,1,2,-lrb- an -rrb- hilarious romantic comedy,-LRB- An -RRB- hilarious romantic co...,-lrb- an -rrb- hilarious romantic comedy .
2208,1,1,never -lrb- sinks -rrb- into exploitation,Never -LRB- sinks -RRB- into exploit...,never -lrb- sinks -rrb- into exploitation .


### Counting accuracy

In [11]:
acc_dict = {}

## Step 1: Using Prof Wang's Standard English Dictionary Only

In [12]:
df["Polarities Found"] = df['Text'].apply(lambda text: findPolarity(' '.join(text.split())))

In [13]:
df["Polarity Count"] = df['Polarities Found'].apply(lambda scores: countPolarity(scores))

In [14]:
acc_dict['Prof Wang Standard English Only'] = accuracy_score(df["Sentiment Num"], df["Polarity Count"])

In [15]:
acc_dict

{'Prof Wang Standard English Only': 0.5678733031674208}

In [16]:
df_confusion = pd.crosstab(df['Polarity Count'],df["Sentiment Num"] , rownames=['Predicted'], colnames=['Actual'], margins= True)
labels = [1, 0, -1]
df_confusion = df_confusion.reindex(labels, axis="columns")
df_confusion = df_confusion.reindex(labels, axis="rows")
df_confusion.loc['All'] = df_confusion.loc[1] + df_confusion.loc[0] + df_confusion.loc[-1]
df_confusion

Actual,1,0,-1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,614,164,242
0,58,67,96
-1,237,158,574
All,909,389,912


In [17]:
# Drop this step's scratch columns by name: a positional [-2:] slice would delete
# real data columns if the cell were run twice.
df = df.drop(columns=["Polarities Found", "Polarity Count"], errors="ignore")


## Step 2: Using Prof Wang's Standard English + Prof Wang's Singlish Dictionary

In [18]:
df["Polarities Found"] = df['Text'].apply(lambda text: findPolarity1(' '.join(text.split())))

In [19]:
df["Polarity Count"] = df['Polarities Found'].apply(lambda scores: countPolarity1(scores))

In [20]:
acc_dict['Prof Wang Standard English + Prof Wang Singlish'] = accuracy_score(df["Sentiment Num"], df["Polarity Count"])

In [21]:

acc_dict

{'Prof Wang Standard English Only': 0.5678733031674208,
 'Prof Wang Standard English + Prof Wang Singlish': 0.5678733031674208}

In [22]:
df_confusion = pd.crosstab(df['Polarity Count'],df["Sentiment Num"] , rownames=['Predicted'], colnames=['Actual'], margins= True)
labels = [1, 0, -1]
df_confusion = df_confusion.reindex(labels, axis="columns")
df_confusion = df_confusion.reindex(labels, axis="rows")
df_confusion.loc['All'] = df_confusion.loc[1] + df_confusion.loc[0] + df_confusion.loc[-1]
df_confusion

Actual,1,0,-1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,614,164,242
0,58,67,96
-1,237,158,574
All,909,389,912


In [23]:
# Drop this step's scratch columns by name: a positional [-2:] slice would delete
# real data columns if the cell were run twice.
df = df.drop(columns=["Polarities Found", "Polarity Count"], errors="ignore")

# Step 3: Combined Standard EL + Combined Singlish

In [24]:
df["Polarities Found"] = df['Text'].apply(lambda text: findPolarity2(' '.join(text.split())))

In [25]:
df["Polarity Count"] = df['Polarities Found'].apply(lambda scores: countPolarity2(scores))

In [26]:
acc_dict['Combined Standard English + Combined Singlish'] = accuracy_score(df["Sentiment Num"], df["Polarity Count"])

In [27]:
acc_dict

{'Prof Wang Standard English Only': 0.5678733031674208,
 'Prof Wang Standard English + Prof Wang Singlish': 0.5678733031674208,
 'Combined Standard English + Combined Singlish': 0.5678733031674208}

In [28]:
df_confusion = pd.crosstab(df['Polarity Count'],df["Sentiment Num"] , rownames=['Predicted'], colnames=['Actual'], margins= True)
labels = [1, 0, -1]
df_confusion = df_confusion.reindex(labels, axis="columns")
df_confusion = df_confusion.reindex(labels, axis="rows")
df_confusion.loc['All'] = df_confusion.loc[1] + df_confusion.loc[0] + df_confusion.loc[-1]
df_confusion

Actual,1,0,-1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,612,162,242
0,57,67,94
-1,240,160,576
All,909,389,912


In [29]:
# Drop this step's scratch columns by name: a positional [-2:] slice would delete
# real data columns if the cell were run twice.
df = df.drop(columns=["Polarities Found", "Polarity Count"], errors="ignore")

In [30]:
df

Unnamed: 0,Sentiment,Sentiment Num,Multi,Text,Text-original,Text_with_fullstop
0,Neutral,0,0,effective but too-tepid biopic,Effective but too-tepid biopic,effective but too-tepid biopic
1,Positive,1,1,if you sometimes like to go to the movies to h...,If you sometimes like to go to ...,if you sometimes like to go to the movies to h...
2,Positive,1,2,emerges as something rare an issue movie that ...,"Emerges as something rare , an i...",emerges as something rare an issue movie that ...
3,Neutral,0,0,the film provides some great insight into the ...,The film provides some great insigh...,the film provides some great insight into the ...
4,Positive,1,2,offers that rare combination of entertainment ...,Offers that rare combination of en...,offers that rare combination of entertainment ...
...,...,...,...,...,...,...
2205,Positive,1,1,an imaginative comedy\/thriller,An imaginative comedy\/thriller .,an imaginative comedy\/thriller .
2206,Positive,1,2,-lrb- a -rrb- rare beautiful film,"-LRB- A -RRB- rare , beautiful fil...",-lrb- a -rrb- rare beautiful film .
2207,Positive,1,2,-lrb- an -rrb- hilarious romantic comedy,-LRB- An -RRB- hilarious romantic co...,-lrb- an -rrb- hilarious romantic comedy .
2208,Positive,1,1,never -lrb- sinks -rrb- into exploitation,Never -LRB- sinks -RRB- into exploit...,never -lrb- sinks -rrb- into exploitation .


# Step 4: Combined Standard EL + Combined Singlish + Transport Domain

In [31]:
df["Polarities Found"] = df['Text'].apply(lambda text: findPolarity3(' '.join(text.split())))

In [32]:
df["Polarity Count"] = df['Polarities Found'].apply(lambda scores: countPolarity3(scores))

In [33]:
acc_dict['Combined Standard English + Combined Singlish + Transport Domain'] = accuracy_score(df["Sentiment Num"], df["Polarity Count"])

In [34]:
acc_dict

{'Prof Wang Standard English Only': 0.5678733031674208,
 'Prof Wang Standard English + Prof Wang Singlish': 0.5678733031674208,
 'Combined Standard English + Combined Singlish': 0.5678733031674208,
 'Combined Standard English + Combined Singlish + Transport Domain': 0.567420814479638}

In [35]:
df_confusion = pd.crosstab(df['Polarity Count'],df["Sentiment Num"] , rownames=['Predicted'], colnames=['Actual'], margins= True)
labels = [1, 0, -1]
df_confusion = df_confusion.reindex(labels, axis="columns")
df_confusion = df_confusion.reindex(labels, axis="rows")
df_confusion.loc['All'] = df_confusion.loc[1] + df_confusion.loc[0] + df_confusion.loc[-1]
df_confusion

Actual,1,0,-1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,599,159,229
0,56,63,91
-1,254,167,592
All,909,389,912


In [36]:
# Drop this step's scratch columns by name: a positional [-2:] slice would delete
# real data columns if the cell were run twice.
df = df.drop(columns=["Polarities Found", "Polarity Count"], errors="ignore")

In [37]:
# df = df.drop(['Polarities Found'], axis=1)

In [38]:
# df = df.rename(columns={"Polarity Count": "Polarity Count after Transport Domain"})

# Step 5: Combined Standard EL + Combined Singlish + Transport Domain + Negation

In [39]:
df["Polarities Found"] = df['Text_with_fullstop'].apply(lambda text: findPolarity4(' '.join(text.split())))

In [40]:
df["Polarity Count"] = df['Polarities Found'].apply(lambda scores: countPolarity4(scores, 7))

In [41]:
acc_dict['Combined Standard English + Combined Singlish + Transport Domain + Negation'] = accuracy_score(df["Sentiment Num"], df["Polarity Count"])

In [42]:
acc_dict

{'Prof Wang Standard English Only': 0.5678733031674208,
 'Prof Wang Standard English + Prof Wang Singlish': 0.5678733031674208,
 'Combined Standard English + Combined Singlish': 0.5678733031674208,
 'Combined Standard English + Combined Singlish + Transport Domain': 0.567420814479638,
 'Combined Standard English + Combined Singlish + Transport Domain + Negation': 0.5710407239819004}

In [43]:
df_confusion = pd.crosstab(df['Polarity Count'],df["Sentiment Num"] , rownames=['Predicted'], colnames=['Actual'], margins= True)
labels = [1, 0, -1]
df_confusion = df_confusion.reindex(labels, axis="columns")
df_confusion = df_confusion.reindex(labels, axis="rows")
df_confusion.loc['All'] = df_confusion.loc[1] + df_confusion.loc[0] + df_confusion.loc[-1]
df_confusion

Actual,1,0,-1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,598,157,220
0,56,63,91
-1,255,169,601
All,909,389,912


In [44]:
# df.to_csv("checking.csv")

In [45]:
# Drop this step's scratch columns by name: a positional [-2:] slice would delete
# real data columns if the cell were run twice.
df = df.drop(columns=["Polarities Found", "Polarity Count"], errors="ignore")

In [46]:
# df = df.rename(columns={"Polarity Count": "Polarity Count after Negation"})

In [47]:
# df_wrongly_labelled_negation = df[df['Polarity Count after Negation'] != df['Sentiment Num']]

In [48]:
# df_wrongly_labelled_negation.to_csv('wrongly_labelled_negation.csv')

In [49]:
# df_wrongly_labelled_negation_vs_transport = df_wrongly_labelled_negation[df_wrongly_labelled_negation['Polarity Count after Negation'] != df_wrongly_labelled_negation['Polarity Count after Transport Domain']]

In [50]:
# df_wrongly_labelled_negation_vs_transport

In [51]:
# df_wrongly_labelled_negation_vs_transport_correct = df_wrongly_labelled_negation_vs_transport[df_wrongly_labelled_negation_vs_transport['Sentiment Num'] == df_wrongly_labelled_negation_vs_transport['Polarity Count after Transport Domain']]

In [52]:
# df_wrongly_labelled_negation_vs_transport_correct

In [53]:
# df_wrongly_labelled_negation_vs_transport_correct.to_csv('transport_correct_label_negation_wrong.csv')

# Step 6: Combined Standard EL + Combined Singlish + Transport Domain + Negation + Too Handling

In [None]:
# Score every record with the "too"-aware lexicon lookup; whitespace is normalised first.
# NOTE(review): the Step 5 negation pass read 'Text_with_fullstop', while this step reads
# 'Text' (all punctuation stripped) — confirm findPolarity4_too does not need the
# sentence-end markers.
df["Polarities Found"] = df['Text'].apply(lambda text: findPolarity4_too(' '.join(text.split())))

In [None]:
df["Polarity Count"] = df['Polarities Found'].apply(lambda scores: countPolarity4(scores, 7))

In [None]:
acc_dict['Combined Standard English + Combined Singlish + Transport Domain + Negation + Too Handling'] = accuracy_score(df["Sentiment Num"], df["Polarity Count"])

In [None]:
acc_dict

In [None]:
df_confusion = pd.crosstab(df['Polarity Count'],df["Sentiment Num"] , rownames=['Predicted'], colnames=['Actual'], margins= True)
labels = [1, 0, -1]
df_confusion = df_confusion.reindex(labels, axis="columns")
df_confusion = df_confusion.reindex(labels, axis="rows")
df_confusion.loc['All'] = df_confusion.loc[1] + df_confusion.loc[0] + df_confusion.loc[-1]
df_confusion

# Step 6.1: Combined Standard EL + Combined Singlish + Transport Domain + Negation + Too Handling + Like Handling

In [None]:
df["Polarities Found"] = df['Text'].apply(lambda text: findPolarity4_too_like(' '.join(text.split())))

In [None]:
df["Polarity Count"] = df['Polarities Found'].apply(lambda scores: countPolarity4(scores, 7))

In [None]:
acc_dict['Combined Standard English + Combined Singlish + Transport Domain + Negation + Too Handling + Like Handling'] = accuracy_score(df["Sentiment Num"], df["Polarity Count"])

In [None]:
acc_dict

In [None]:
df_confusion = pd.crosstab(df['Polarity Count'],df["Sentiment Num"] , rownames=['Predicted'], colnames=['Actual'], margins= True)
labels = [1, 0, -1]
df_confusion = df_confusion.reindex(labels, axis="columns")
df_confusion = df_confusion.reindex(labels, axis="rows")
df_confusion.loc['All'] = df_confusion.loc[1] + df_confusion.loc[0] + df_confusion.loc[-1]
df_confusion

# Step 6.2: Combined Standard EL + Combined Singlish + Transport Domain + Negation + Too Handling + Like Handling + Question mark Handling

In [None]:
#df.head()

In [None]:
def qn_mark(original_text, polarity):
    """Force a negative polarity for questions that do not open with a wh-word.

    Parameters
    ----------
    original_text : str
        The uncleaned text of the record (the ``?`` must still be present).
    polarity : int
        The polarity computed so far for this record.

    Returns
    -------
    int
        ``-1`` when the text contains ``?`` and its first word is not one of
        what / why / who / where / when / how (compared case-insensitively);
        otherwise ``polarity`` unchanged.
    """
    wh_words = {'what', 'why', 'who', 'where', 'when', 'how'}
    if '?' not in original_text:
        return polarity
    # The text contains '?', so splitting on whitespace yields at least one token.
    first_word = original_text.split(None, 1)[0]
    # Lower-casing also covers all-caps openings such as "WHY", which the
    # original lower/Capitalised word list missed.
    if first_word.lower() in wh_words:
        return polarity
    return -1
        

In [None]:
df["Polarity Count"] = df.apply(lambda a: qn_mark(a['Text-original'],a['Polarity Count']),axis=1)

In [None]:
#df.head()

In [None]:
acc_dict['Combined Standard English + Combined Singlish + Transport Domain + Negation + Too Handling + Like Handling + Qn Mark Handling'] = accuracy_score(df["Sentiment Num"], df["Polarity Count"])

In [None]:
acc_dict

In [None]:
df_confusion = pd.crosstab(df['Polarity Count'],df["Sentiment Num"] , rownames=['Predicted'], colnames=['Actual'], margins= True)
labels = [1, 0, -1]
df_confusion = df_confusion.reindex(labels, axis="columns")
df_confusion = df_confusion.reindex(labels, axis="rows")
df_confusion.loc['All'] = df_confusion.loc[1] + df_confusion.loc[0] + df_confusion.loc[-1]
df_confusion

# Step 7: Combined Standard EL + Combined Singlish + Transport Domain + Negation + Too Handling + Like Handling + Question mark Handling + Sarcasm

In [None]:
df["Sarcasm?"] = df['Polarities Found'].apply(lambda row: recognise_sarcasm(row))

In [None]:

df.loc[(df['Sarcasm?'] != 0), 'Polarity Count'] = -1

In [None]:
# Record the accuracy after the sarcasm override (result label typo "Nagation" corrected).
acc_dict['Combined Standard English + Combined Singlish + Transport Domain + Negation + Too Handling + Like Handling + Sarcasm'] = accuracy_score(df["Sentiment Num"], df["Polarity Count"])

In [None]:
acc_dict

In [None]:
df_confusion = pd.crosstab(df['Polarity Count'],df["Sentiment Num"] , rownames=['Predicted'], colnames=['Actual'], margins= True)
labels = [1, 0, -1]
df_confusion = df_confusion.reindex(labels, axis="columns")
df_confusion = df_confusion.reindex(labels, axis="rows")
df_confusion.loc['All'] = df_confusion.loc[1] + df_confusion.loc[0] + df_confusion.loc[-1]
df_confusion

In [None]:
sarcasm_df = df

In [None]:
sarcasm_df

In [None]:
sarcasm_df = sarcasm_df.rename(columns={'Polarity Count': 'Sarcasm Polarity Count'})

In [None]:
sarcasm_df

# Step 8: Combined Standard EL + Combined Singlish + Transport Domain + Negation + Too Handling + Sarcasm + Adversative

In [None]:

df["Polarities Found"] = df['Text'].apply(lambda text: findPolarity5(' '.join(text.split())))
df["Adversative Polarity"] = df['Polarities Found'].apply(lambda scores: countPolarity5(scores, 7))

In [None]:
# def flip(sarcasm,polarity_count):
#     if sarcasm==-1:
#         if polarity_count>0:
#             return polarity_count*(-1)
#         else:
#             return polarity_count
#     elif sarcasm==0:
#         return polarity_count

In [None]:
def adversative_present(polarity_list):
    """Return 1 when ``polarity_list`` contains the score 8 or -8, otherwise 0.

    Presumably 8 / -8 are the markers the polarity lookup emits for an
    adversative word — confirm against the lexicon module.
    """
    for marker in (8, -8):
        if marker in polarity_list:
            return 1
    return 0

# label presence of adversative
df['Adversative Present?']=df['Polarities Found'].apply(lambda pl:adversative_present(pl))

In [None]:
def update_p_after_adversative(present, polaritys, polaritya):
    """Pick which polarity to keep for one record.

    Returns ``polaritya`` when ``present`` equals 1, ``polaritys`` when it
    equals 0, and ``None`` for any other flag value.
    """
    selected = None
    if present == 1:
        selected = polaritya
    elif present == 0:
        selected = polaritys
    return selected

df['Polarity Count-after Adversative'] = df.apply(lambda x: update_p_after_adversative(x['Adversative Present?'],x['Polarity Count'],x['Adversative Polarity']),axis=1)

In [None]:
df["Polarity Count-after Adversative"] = df.apply(lambda a: qn_mark(a['Text-original'],a['Polarity Count-after Adversative']),axis=1)

In [None]:
df

In [None]:
# checking
df['Polarity Count-after Adversative'].unique()

In [None]:
adversative_df = df

In [None]:
adversative_df

In [None]:
acc_dict['Combined Standard English + Combined Singlish + Transport Domain + Negation + Too Handling + Like Handling + Sarcasm + Adversative'] = accuracy_score(adversative_df["Sentiment Num"], adversative_df["Polarity Count-after Adversative"])

In [None]:
acc_dict

In [None]:
df_confusion = pd.crosstab(df['Polarity Count-after Adversative'],df["Sentiment Num"] , rownames=['Predicted'], colnames=['Actual'], margins= True)
labels = [1, 0, -1]
df_confusion = df_confusion.reindex(labels, axis="columns")
df_confusion = df_confusion.reindex(labels, axis="rows")
df_confusion.loc['All'] = df_confusion.loc[1] + df_confusion.loc[0] + df_confusion.loc[-1]
df_confusion

# Step 9: Combined Standard EL + Combined Singlish + Transport Domain + Negation + Too Handling + Sarcasm + Adversative +  Emoji

In [None]:
adversative_df

In [None]:
adversative_df["Emoji Score"] = adversative_df['Text'].apply(lambda x: find_emoji(x))
adversative_df.loc[(adversative_df['Polarity Count-after Adversative'] == 0), 'Polarity Count-after Adversative'] = adversative_df['Emoji Score'] #emoji handling only when 0 is present

In [None]:
adversative_df

In [None]:
# for checking
adversative_df['Polarity Count-after Adversative'].unique()

In [None]:
acc_dict['Combined Standard English + Combined Singlish + Transport Domain + Negation + Too Handling + Like Handling + Sarcasm + Adversative+ emoji'] = accuracy_score(adversative_df["Sentiment Num"], adversative_df["Polarity Count-after Adversative"])

In [None]:
acc_dict

In [None]:
# Read from adversative_df, the frame this step updates, instead of relying on `df`
# happening to be the same object.
labels = [1, 0, -1]
df_confusion = pd.crosstab(adversative_df['Polarity Count-after Adversative'], adversative_df["Sentiment Num"], rownames=['Predicted'], colnames=['Actual'], margins=True)
# fill_value=0 keeps a label that never occurs as a row/column of zeros rather than NaN,
# so the recomputed 'All' row stays numeric.
df_confusion = df_confusion.reindex(labels, axis="columns", fill_value=0)
df_confusion = df_confusion.reindex(labels, axis="rows", fill_value=0)
# The reindex dropped the margins; rebuild the per-column totals over the kept labels.
df_confusion.loc['All'] = df_confusion.loc[1] + df_confusion.loc[0] + df_confusion.loc[-1]
df_confusion




# Step 10: Combined Standard EL + Combined Singlish + Transport Domain + Negation + Too Handling + Sarcasm + Adversative + Emoji + Multi

In [None]:
adversative_df

In [None]:
adversative_df["Polarities Found"] = adversative_df['Text'].apply(lambda text: findPolarity6(' '.join(text.split())))

In [None]:
adversative_df['Polarity Count-multi'] = adversative_df.apply(lambda scores: multi_value(scores['Polarities Found'],scores['Text'], scores['Polarity Count-after Adversative'], 5), axis=1)

In [None]:
adversative_df["Polarity Count-multi"] = adversative_df.apply(lambda a: qn_mark(a['Text-original'],a['Polarity Count-multi']),axis=1)

In [None]:
adversative_df # the last column gives us more information on the strength polarity

In [None]:
# for checking
adversative_df['Polarity Count-multi'].unique()

In [None]:
 def new_multi(value):
     """Bucket a polarity strength in [-1, 1] into a five-level label.

     Mapping: 0 -> 0, (0, 0.5] -> 1, (0.5, 1] -> 2, [-0.5, 0) -> -1,
     [-1, -0.5) -> -2.

     Apply this exactly once per column: 1 and -1 are valid inputs, so
     mapping already-bucketed labels would silently turn them into 2 / -2.

     Raises
     ------
     ValueError
         If ``value`` is NaN or lies outside [-1, 1]. Previously such input
         fell through every branch and failed with an UnboundLocalError.
     """
     if value == 0:
         return 0
     if 0 < value <= 0.5:
         return 1
     if 0.5 < value <= 1:
         return 2
     if -0.5 <= value < 0:
         return -1
     if -1 <= value < -0.5:
         return -2
     raise ValueError(f"new_multi expects a score in [-1, 1], got {value!r}")

adversative_df['Polarity Count-multi']=adversative_df['Polarity Count-multi'].apply(new_multi)
# adversative_df['Multi']=adversative_df['Multi'].apply(new_multi)

In [None]:

acc_dict['Combined Standard English + Combined Singlish + Transport Domain + Negation + Too Handling + Like Handling + Sarcasm + Adversative+ Emoji + Multi'] = accuracy_score(adversative_df["Multi"], adversative_df["Polarity Count-multi"])

In [None]:
acc_dict

In [None]:
# Read from adversative_df, the frame that actually received 'Polarity Count-multi',
# instead of relying on `df` happening to be the same object.
labels = [2, 1, 0, -1, -2]
df_confusion = pd.crosstab(adversative_df['Polarity Count-multi'], adversative_df["Multi"], rownames=['Predicted'], colnames=['Actual'], margins=True)
# fill_value=0 keeps a label that never occurs as a row/column of zeros rather than NaN,
# so the recomputed 'All' row stays numeric.
df_confusion = df_confusion.reindex(labels, axis="columns", fill_value=0)
df_confusion = df_confusion.reindex(labels, axis="rows", fill_value=0)
# The reindex dropped the margins; rebuild the per-column totals over the kept labels.
df_confusion.loc['All'] = df_confusion.loc[2] + df_confusion.loc[1] + df_confusion.loc[0] + df_confusion.loc[-1] + df_confusion.loc[-2]
df_confusion

In [None]:
adversative_df.head()

In [None]:
# for checking
adversative_df['Polarity Count-multi'].unique()

In [None]:
# write the full results table (including the intermediate columns) to CSV for checking
adversative_df.to_csv('Results-lexiconbased-23Julytest.csv')

# Step 11: Evaluation

In [None]:
# for adversative since multi is in another ipynb
df_confusion = pd.crosstab(adversative_df['Sentiment Num'],adversative_df["Polarity Count-after Adversative"] , rownames=['Actual'], colnames=['Predicted'], margins=True)
labels = [1, 0, -1]
df_confusion = df_confusion.reindex(labels, axis="columns")
df_confusion = df_confusion.reindex(labels, axis="rows")
# df_confusion.loc['All'] = df_confusion.loc[1] + df_confusion.loc[0] + df_confusion.loc[-1]

In [None]:
df_confusion

In [None]:
# normalised confusion matrix: every row is divided by its own row total.
# `frame / series` aligns the series with the COLUMNS, which would divide column j
# by the total of row j; div(..., axis=0) aligns with the row index instead.
df_conf_norm = df_confusion.div(df_confusion.sum(axis=1), axis=0)
df_conf_norm

In [None]:
# confusion matrix plot
import matplotlib.pyplot as plt
import numpy as np

def plot_confusion_matrix(df_confusion, title='Confusion matrix', cmap=plt.cm.gray_r):
    """Draw a confusion-matrix DataFrame as a shaded grid with a colour bar.

    Parameters
    ----------
    df_confusion : pandas.DataFrame
        Matrix to draw; its index and column labels become the tick labels
        and its index/columns names become the axis labels.
    title : str
        Accepted for compatibility; the title is currently not drawn.
    cmap : matplotlib colormap
        Colour map passed to ``plt.matshow``.
    """
    plt.matshow(df_confusion, cmap=cmap) # imshow
    #plt.title(title)
    plt.colorbar()
    # Tick positions are computed per axis so a non-square matrix
    # (more rows than columns or vice versa) is labelled correctly.
    column_ticks = np.arange(len(df_confusion.columns))
    row_ticks = np.arange(len(df_confusion.index))
    plt.xticks(column_ticks, df_confusion.columns, rotation=45)
    plt.yticks(row_ticks, df_confusion.index)
    #plt.tight_layout()
    plt.ylabel(df_confusion.index.name)
    plt.xlabel(df_confusion.columns.name)

plot_confusion_matrix(df_confusion)


In [None]:
# plot normalized confusion matrix

plot_confusion_matrix(df_conf_norm)  

In [None]:
# f1 score
from sklearn.metrics import f1_score
f1_score(adversative_df['Sentiment Num'], adversative_df['Polarity Count-after Adversative'], average='weighted')

In [None]:
f1_score(adversative_df['Sentiment Num'], adversative_df['Polarity Count-after Adversative'], average='micro')

In [None]:
f1_score(adversative_df['Sentiment Num'], adversative_df['Polarity Count-after Adversative'], average='macro')

##  Step 12: Cross Validation

In [None]:
# # shuffle the data

# df = df.sample(frac=1).reset_index(drop=True)
# print(df)

In [None]:
# df1=df[:229]
# print(df1)
# df1.to_csv('df1.csv')

In [None]:
# df2=df[229:458]
# print(df2)
# df2.to_csv('df2.csv')

In [None]:
# df3=df[458:687]
# print(df3)
# df3.to_csv('df3.csv')

In [None]:
# df4=df[687:916]
# print(df4)
# df4.to_csv('df4.csv')

In [None]:
# df5=df[916:]
# print(df5)
# df5.to_csv('df5.csv')

In [None]:
# rerun the model on each df to test if the accuracy is stable

# Step 13: Majority Voting (majority-class baseline)

In [None]:
print(df.head())

In [None]:
df.groupby(by='Sentiment').count()

In [None]:
df['allnegative']=-1

In [None]:
print(df.head())

In [None]:
from sklearn.metrics import accuracy_score
# benchmark: suppose our model predicts negative (-1) for every record
print(accuracy_score(df["Sentiment Num"], df["allnegative"]))