In [1]:
#import/install all packages at the top

import pandas as pd
from sklearn.model_selection import train_test_split
import spacy
nlp = spacy.load("en_core_web_sm")
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
from spacy.lang.en.stop_words import STOP_WORDS
import string

from features_stats import *
from classification_stats import *

# Sections:

# (1) Import dataset and split

# (2) Functions

# (3) Validation Testing (obtain stats ready for comparison)
------- get stats and create summary df
- (3.1) Average Word Count
- (3.2) Average Sentence Count
- (3.3) Punctuation Richness
- (3.4) Sarcasm Symbol
- (3.5) Upper-case Words
- (3.6) (Verb) Lemmas
- (3.7) Sentiment Classification

- (3.8) Individual Punctuation Count
- (3.9) Word Type Count
- (3.10) Named Entity Count

# (4) Additional Functions for Classification
# (5) Classification
# (5) Classification Results
# (6) Accuracy Score

# (1) Import and Split

In [2]:
#Import and Read file as DF with PANDAS (for better visualisation)
filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/irony-labeled.csv"
gold_label = pd.read_csv(filename)

In [3]:
#Rename the columns
gold_label.columns = ["Comment_Text", "Label"]

In [4]:
#Split to get two DFs (prep for split)

y = gold_label["Comment_Text"]
x = gold_label["Label"]

In [5]:
#Split the dataset into TEST and TRAIN sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=43)

#Split the TRAIN set again to get VALIDATION set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=43)

In [6]:
#JOIN the series together to get final splits as DFs
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)
val = pd.concat([X_val, y_val], axis=1)

In [7]:
#Check number of comments labelled as ironic vs non-ironic
ironic_val = val[val["Label"] == 1]
nonironic_val = val[val["Label"] == -1]

print(f"Validation data contains {len(ironic_val)} IRONIC comments")
print(f"Validation data contains {len(nonironic_val)} NON- IRONIC comments")

Validation data contains 47 IRONIC comments
Validation data contains 148 NON- IRONIC comments


In [8]:
#Convert TEST(validation) set into a dictionary
val_dict = val.set_index(val.index).T.to_dict()

print(len(val_dict))

195


# (3) Validation Tetsing
- Obtain all statistics (based on training) ready for comparison in next section

In [9]:
#1) GET ALL TOKENS
tokens = get_all_tokens(val_dict)

In [10]:
#2) Get list of ONLY words (no punct)
word_list = get_words(tokens)

In [11]:
#3) Get list of ONLY punct (no words)
punct_list = get_punct(tokens)

In [12]:
#4) WORD LENGTH
average_word_list = []
for comment in word_list:
    average_word_list.append(average_word_length(comment))

print(len(average_word_list))    
    
#Create DataFrame for Summary of Irony STATS
summary= pd.DataFrame({"Average Word Length": average_word_list})

195


In [13]:
#Create df for total, full returns for irony
total_val= pd.DataFrame({'Comment Parsed':tokens})
total_val["Tokens"] = word_list
total_val["Punctuation"] = punct_list
total_val.head()

Unnamed: 0,Comment Parsed,Tokens,Punctuation
0,"(Cola, costs, more, per, litre, than, petrol, ...","[Cola, costs, more, per, litre, than, petrol, ...",[.]
1,"(Damn, ., , I, ca, n't, believe, this, book,...","[Damn, , I, ca, n't, believe, this, book, se...","[., ., ,, .]"
2,"(My, mother, -, in, -, law, watches, way, to, ...","[My, mother, in, law, watches, way, to, much, ...","[-, -, ,, ,, ,, ., ,, ., ,, "", ,, ,, !, !, "", .]"
3,"(Do, you, know, what, type, of, salt, was, use...","[Do, you, know, what, type, of, salt, was, use...",[?]
4,"(Also, ,, I, 'm, pretty, sure, surveys, have, ...","[Also, I, 'm, pretty, sure, surveys, have, sho...","[,, ., ,, .]"


In [14]:
#4) SENTENCE LENGTH
average_sentence_list = []
for x in tokens:
    average_sentence_list.append(average_sent_length(x))

#Add to Summary of Irony STATS df
summary["Average Sentence Length"] = average_sentence_list
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length
0,4.555556,10.0
1,4.08,9.666667
2,3.625,27.2
3,3.666667,13.0
4,4.704545,24.0


In [15]:
#5) /S SYMBOLS

sarcfunc = []
for x in tokens:
    sarcfunc.append(check_sarcsymbol(x))


sarcsymb_list = []        
for l in sarcfunc:
    if len(l) >= 1:
        sarcsymb_list.append(l)
    else:
        sarcsymb_list.append([0])

#Remove list layer 
sarcsymb_list = list(chain.from_iterable(sarcsymb_list))



summary["Average '/s' symbol count"] = sarcsymb_list

In [16]:
#7) UPPER CASE WORDS (total)

uppercase_list = []
for b in tokens:
    uppercase_list.append((count_uppercase(b)))
    
#Remove list layer 
uppercase_list = list(chain.from_iterable(uppercase_list))

summary["Average Upper-case Words"] = uppercase_list
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length,Average '/s' symbol count,Average Upper-case Words
0,4.555556,10.0,0.0,0.0
1,4.08,9.666667,0.0,0.034483
2,3.625,27.2,0.0,0.051471
3,3.666667,13.0,0.0,0.0
4,4.704545,24.0,0.0,0.020833


In [17]:
#6) PUNCTUATION RICHNESS
average_punct_list = get_punct_average(punct_list, tokens)

summary["Punctuation Richness"] = average_punct_list
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length,Average '/s' symbol count,Average Upper-case Words,Punctuation Richness
0,4.555556,10.0,0.0,0.0,6.0
1,4.08,9.666667,0.0,0.034483,18.5
2,3.625,27.2,0.0,0.051471,84.0
3,3.666667,13.0,0.0,0.0,7.5
4,4.704545,24.0,0.0,0.020833,28.0


In [18]:
#9) AVERAGE NUMBER OF LEMMAS

lemma_list = []
for doc in tokens:
    lemma_list.append(get_lemmas(doc))
    
summary["Verb Lemma Average"] = lemma_list
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length,Average '/s' symbol count,Average Upper-case Words,Punctuation Richness,Verb Lemma Average
0,4.555556,10.0,0.0,0.0,6.0,0.1
1,4.08,9.666667,0.0,0.034483,18.5,0.241379
2,3.625,27.2,0.0,0.051471,84.0,0.132353
3,3.666667,13.0,0.0,0.0,7.5,0.307692
4,4.704545,24.0,0.0,0.020833,28.0,0.1875


In [19]:
#10 SENTIMENT CLASSIFICATION
#1 = positive, -1 = negative

sentiment = get_sentiment(val_dict)

summary["Sentiment Classification"] = sentiment 
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length,Average '/s' symbol count,Average Upper-case Words,Punctuation Richness,Verb Lemma Average,Sentiment Classification
0,4.555556,10.0,0.0,0.0,6.0,0.1,-1
1,4.08,9.666667,0.0,0.034483,18.5,0.241379,1
2,3.625,27.2,0.0,0.051471,84.0,0.132353,-1
3,3.666667,13.0,0.0,0.0,7.5,0.307692,1
4,4.704545,24.0,0.0,0.020833,28.0,0.1875,1


In [20]:
#8) INDIVIDUAL PUNCTUATION AVERAGE

average_indiv_punc_list = []
for x in tokens:
    average_indiv_punc_list.append(get_indiv_punct(x))


summary_indiv_punct = pd.DataFrame(average_indiv_punc_list)
summary_indiv_punct.head()

Unnamed: 0,!,"""",#,%,&,',(,),*,",",...,:,:),;,?,[,],_,–,“,”
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,0.034483,...,,,,,,,,,,
2,0.014706,0.014706,,,,,,,,0.051471,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,0.076923,,,,,,
4,,,,,,,,,,0.041667,...,,,,,,,,,,


In [21]:
#7) WORD TYPE AVERAGE 

average_wordtype_list = []
for comment in tokens:
    average_wordtype_list.append(relative_count_wordtypes(comment))

summary_wordtypedf = pd.DataFrame(average_wordtype_list)
summary_wordtypedf.head()

Unnamed: 0,ADJ,ADP,ADV,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SPACE,SYM,VERB,X
0,0.1,0.2,0.2,,,,0.2,,,,0.1,0.1,,,0.1,
1,,0.068966,0.068966,0.034483,0.068966,0.034483,0.172414,,,0.103448,,0.137931,0.068966,,0.241379,
2,0.073529,0.088235,0.029412,0.036765,0.088235,,0.161765,0.014706,0.022059,0.088235,0.014706,0.117647,0.022059,0.007353,0.227941,0.007353
3,0.076923,0.153846,,,,,0.307692,,,0.076923,,0.076923,,,0.307692,
4,0.0625,0.125,0.083333,0.041667,0.020833,,0.1875,,0.041667,0.0625,,0.083333,0.020833,,0.270833,


In [22]:
#10) NAMED ENTITIES AVERAGE
named_entity_list = []
for comment in tokens:
    named_entity_list.append(get_entities(comment))
    
summary_named_entity = pd.DataFrame(named_entity_list)

summary_named_entity = summary_named_entity.replace(np.nan, 0) 
summary_named_entity.head()

Unnamed: 0,CARDINAL,DATE,EVENT,FAC,GPE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.007353,0.014706,0.0,0.0,0.0,0.0,0.0,0.007353,0.0,0.0,0.007353,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020833,0.0,0.0,0.0,0.0,0.0,0.0


# (4) Additional Functions (Classification)

# (5) Classification

Steps:
- (1) Import Master DF ###(1) GENERAL
- (2) Get Results for each comparison using classification function (1) Ir, (2) Non-ir
- (3) Create PredictorDF for (1) Ironic, (2) Non-ironic

''''''''''''''''''''''''''''''
- (4) Import Master DF ###(2) POS
- (5) Get Results for each comparison using classification function (1) Ir, (2) Non-ir
- (6) Add to each PredictorDF for (1) Ironic, (2) Non-ironic

''''''''''''''''''''''''''''''
- (7) Import Master DF ###(3) NAMED ENTITY
- Repeat steps 5 & 6

''''''''''''''''''''''''''''''
- (8) Import Master DF ###(4) PUNCTUATION
- Repeat steps 5 & 6 

In [23]:
####(1)
#import GENERAL summary table
master_filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/train_summary_general.csv"
mastergeneral_df = pd.read_csv(master_filename)
mastergeneral_df.head()

mastergeneral_df = mastergeneral_df.rename(columns={mastergeneral_df.columns[0]: "Class"})

mastergeneral_df

Unnamed: 0,Class,Average Word Length,Average Sentence Length,Average '/s' symbol count,Average Upper-case Words,Punctuation Richness,Verb Lemma Average,Sentiment Classification
0,Ironic Comments,4.304183,14.261968,0.000783,0.02535,20.927649,0.154378,0.405685
1,Non-ironic Comments,4.41101,15.805244,0.0,0.02346,36.150972,0.146618,0.455476


In [24]:
#get results from classification function for IRONIC
ironic_average_word_length = get_classification_ironic(mastergeneral_df, summary, 1, 0, 0.8)
ironic_average_sent_length= get_classification_ironic(mastergeneral_df, summary, 2, 1, 1)
ironic_average_sarcsymb= get_classification_ironic(mastergeneral_df, summary, 3, 2, 7)
ironic_average_uppercase = get_classification_ironic(mastergeneral_df, summary, 4, 3, 2)
ironic_punct_richness = get_classification_ironic(mastergeneral_df, summary, 5, 4, 2)
# ironic_average_verblemma = get_classification_ironic(mastergeneral_df, summary, 6, 5, 1)
# ironic_average_sentiment = get_classification_ironic(mastergeneral_df, summary, 7, 6, 1)

In [25]:
#IRONIC
#Create PREDICTOR DATAFRAME with classifications (all features)

ironic_predictor_df = pd.DataFrame(ironic_average_word_length)
ironic_predictor_df.columns = ['WORD LENGTH'] + ironic_predictor_df.columns.tolist()[1:]

ironic_predictor_df["SENTENCE LENGTH"] = ironic_average_sent_length
ironic_predictor_df["PUNCT RICH"] = ironic_punct_richness
ironic_predictor_df["SARC SYMB /S"] = ironic_average_sarcsymb
ironic_predictor_df["UPPERCASE"] = ironic_average_uppercase
# ironic_predictor_df["Verb Lemma Average"] = ironic_average_verblemma
# ironic_predictor_df["Sentiment Classification"] = ironic_average_sentiment

ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,PUNCT RICH,SARC SYMB /S,UPPERCASE
0,0.201098,4.261968,29.855297,0.005481,0.050701
1,0.179346,4.595301,4.855297,0.005481,0.018265
2,0.543346,12.938032,126.144703,0.005481,0.052241
3,0.510013,1.261968,26.855297,0.005481,0.050701
4,0.32029,9.738032,14.144703,0.005481,0.009034


In [26]:
#get results from classification function for NON-IRONIC
non_ironic_average_word_length = get_classification_non_ironic(mastergeneral_df, summary, 1, 0, 0.8)
non_ironic_average_sent_length= get_classification_non_ironic(mastergeneral_df, summary, 2, 1, 1)
non_ironic_average_sarcsymb= get_classification_non_ironic(mastergeneral_df, summary, 3, 2, 7)
non_ironic_average_uppercase = get_classification_non_ironic(mastergeneral_df, summary, 4, 4, 2)
non_ironic_punct_richness = get_classification_non_ironic(mastergeneral_df, summary, 5, 3, 2)
# non_ironic_average_verblemma = get_classification_non_ironic(mastergeneral_df, summary, 6, 5, 1)
# non_ironic_average_sentiment = get_classification_non_ironic(mastergeneral_df, summary, 7, 6, 1)

In [27]:
#NON-IRONIC
#Create PREDICTOR DATAFRAME with classifications (all features)

non_ironic_predictor_df = pd.DataFrame(non_ironic_average_word_length)
non_ironic_predictor_df.columns = ['WORD LENGTH'] + non_ironic_predictor_df.columns.tolist()[1:]

non_ironic_predictor_df["SENTENCE LENGTH"] = non_ironic_average_sent_length
non_ironic_predictor_df["SARC SYMB /S"] = non_ironic_average_sarcsymb
non_ironic_predictor_df["PUNCT RICH"] = non_ironic_punct_richness
non_ironic_predictor_df["UPPERCASE"] = non_ironic_average_uppercase
# non_ironic_predictor_df["Verb Lemma Average"] = non_ironic_average_verblemma
# non_ironic_predictor_df["Sentiment Classification"] = non_ironic_average_sentiment
non_ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,SARC SYMB /S,PUNCT RICH,UPPERCASE
0,0.115636,5.805244,0.0,72.301945,11.953081
1,0.264808,6.138577,0.0,72.232979,36.953081
2,0.628808,11.394756,0.0,72.199004,167.953081
3,0.595475,2.805244,0.0,72.301945,14.953081
4,0.234828,8.194756,0.0,72.260278,55.953081


In [28]:
####(2)
#import POS summary table
master_wordtype_filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/train_summary_wordtype.csv"
masterwordtype_df = pd.read_csv(master_wordtype_filename)
masterwordtype_df.head()

masterwordtype_df = masterwordtype_df.rename(columns={mastergeneral_df.columns[0]: "Class"}) 
masterwordtype_df.head()

Unnamed: 0.1,Unnamed: 0,ADJ,ADP,ADV,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SPACE,SYM,VERB,X
0,Ironic,0.08507,0.07827,0.064795,0.017629,0.073488,0.007642,0.164352,0.007932,0.022388,0.063109,0.058805,0.145819,0.03144,0.004067,0.174846,0.000348
1,Non-ironic,0.080089,0.083053,0.070846,0.023582,0.081,0.005765,0.158435,0.009828,0.022023,0.061435,0.043878,0.141197,0.032398,0.005573,0.180089,0.00081


In [29]:
#get results from classification function for IRONIC
#E.g. PRON, PROPN, NOUN

# ironic_PRON_dist= get_classification_ironic(masterwordtype_df, summary_wordtypedf, 10, 9, 1)
ironic_PROPN_dist_length= get_classification_ironic(masterwordtype_df, summary_wordtypedf, 11, 10, 1)
# ironic_NOUN_dist = get_classification_ironic(masterwordtype_df, summary_wordtypedf, 7, 6, 1)

In [30]:
#Add to IRONIC PREDICTOR DATAFRAME
# ironic_predictor_df["PRON"] = ironic_PRON_dist
ironic_predictor_df["PROPN"] = ironic_PROPN_dist_length
# ironic_predictor_df["NOUN"] = ironic_NOUN_dist
ironic_predictor_df = ironic_predictor_df.replace(np.nan, 0)
ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,PUNCT RICH,SARC SYMB /S,UPPERCASE,PROPN
0,0.201098,4.261968,29.855297,0.005481,0.050701,0.041195
1,0.179346,4.595301,4.855297,0.005481,0.018265,0.0
2,0.543346,12.938032,126.144703,0.005481,0.052241,0.044099
3,0.510013,1.261968,26.855297,0.005481,0.050701,0.0
4,0.32029,9.738032,14.144703,0.005481,0.009034,0.0


In [31]:
#get results from classification function for NON-IRONIC
#E.g. PRON, PROPN, NOUN

# nonironic_PRON_dist= get_classification_non_ironic(masterwordtype_df, summary_wordtypedf, 10, 9, 1)
nonironic_PROPN_dist_length= get_classification_non_ironic(masterwordtype_df, summary_wordtypedf, 11, 10, 1)
# nonironic_NOUN_dist = get_classification_non_ironic(masterwordtype_df, summary_wordtypedf, 7, 6, 1)

In [32]:
#Add to NON-IRONIC PREDICTOR DATAFRAME
# non_ironic_predictor_df["PRON"] = nonironic_PRON_dist
non_ironic_predictor_df["PROPN"] = nonironic_PROPN_dist_length
# non_ironic_predictor_df["NOUN"] = nonironic_NOUN_dist
non_ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,SARC SYMB /S,PUNCT RICH,UPPERCASE,PROPN
0,0.115636,5.805244,0.0,72.301945,11.953081,0.056122
1,0.264808,6.138577,0.0,72.232979,36.953081,
2,0.628808,11.394756,0.0,72.199004,167.953081,0.029172
3,0.595475,2.805244,0.0,72.301945,14.953081,
4,0.234828,8.194756,0.0,72.260278,55.953081,


In [33]:
####(3)
#import NER summary table
master_ner_filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/train_summary_namedentity.csv"
masterentity_df = pd.read_csv(master_ner_filename)
masterentity_df.head()

masterentity_df.rename(columns={mastergeneral_df.columns[0]: "Class"})
masterentity_df = masterentity_df.replace(np.nan, 0)
masterentity_df

Unnamed: 0.1,Unnamed: 0,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART
0,Ironic,0.004144,0.003744,6.6e-05,8e-06,0.006832,0.0,0.000155,0.00082,0.001215,0.00809,0.000965,0.013508,0.001186,0.010383,0.000144,8e-06,0.000418,0.00028
1,Non-ironic,0.004955,0.004464,0.000106,7.8e-05,0.004595,4.4e-05,0.000126,0.000499,0.000849,0.006177,0.000913,0.008829,0.000773,0.00902,4.9e-05,0.000225,0.000444,0.000677


In [34]:
#get results from classification function for IRONIC
#E.g. PERSON, LOC, GPE, LANGUAGE (none)

ironic_PERSON_dist= get_classification_ironic(masterentity_df, summary_named_entity, 14, 13, 4)
# ironic_LOC_dist_length= get_classification_ironic(masterentity_df, summary_named_entity, 8, 7, 1)
ironic_GPE_dist = get_classification_ironic(masterentity_df, summary_named_entity, 5, 4, 5)
# ironic_LANGUAGE_dist = get_classification_ironic(masterentity_df, summary_named_entity, 6, 5, 1)

ironic_ORG_dist= get_classification_ironic(masterentity_df, summary_named_entity, 12, 11, 7)

In [35]:
#Add to IRONIC PREDICTOR DATAFRAME
ironic_predictor_df["PERSON"] = ironic_PERSON_dist
# ironic_predictor_df["LOC"] = ironic_LOC_dist_length
ironic_predictor_df["GPE"] = ironic_GPE_dist
# ironic_predictor_df["LANGUAGE"] = ironic_LANGUAGE_dist
ironic_predictor_df["ORG"] = ironic_ORG_dist

ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,PUNCT RICH,SARC SYMB /S,UPPERCASE,PROPN,PERSON,GPE,ORG
0,0.201098,4.261968,29.855297,0.005481,0.050701,0.041195,0.041533,0.03416,0.094556
1,0.179346,4.595301,4.855297,0.005481,0.018265,0.0,0.041533,0.03416,0.094556
2,0.543346,12.938032,126.144703,0.005481,0.052241,0.044099,0.041533,0.03416,0.094556
3,0.510013,1.261968,26.855297,0.005481,0.050701,0.0,0.041533,0.03416,0.094556
4,0.32029,9.738032,14.144703,0.005481,0.009034,0.0,0.041533,0.03416,0.094556


In [36]:
#get results from classification function for NON- IRONIC
#E.g. PERSON, LOC, GPE, LANGUAGE (none)

nonironic_PERSON_dist= get_classification_non_ironic(masterentity_df, summary_named_entity, 14, 13, 4)
# nonironic_LOC_dist= get_classification_non_ironic(masterentity_df, summary_named_entity, 8, 7, 1)
nonironic_GPE_dist = get_classification_non_ironic(masterentity_df, summary_named_entity, 5, 4, 5)
# nonironic_LANGUAGE_dist = get_classification_non_ironic(masterentity_df, summary_named_entity, 6, 5, 1)
nonironic_ORG_dist = get_classification_non_ironic(masterentity_df, summary_named_entity, 12, 11, 7)

In [37]:
#Add to NON-IRONIC PREDICTOR DATAFRAME
non_ironic_predictor_df["PERSON"] = nonironic_PERSON_dist
# non_ironic_predictor_df["LOC"] = nonironic_LOC_dist
non_ironic_predictor_df["GPE"] = nonironic_GPE_dist
# non_ironic_predictor_df["LANGUAGE"] = nonironic_LANGUAGE_dist
non_ironic_predictor_df["ORG"] = nonironic_ORG_dist


non_ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,SARC SYMB /S,PUNCT RICH,UPPERCASE,PROPN,PERSON,GPE,ORG
0,0.115636,5.805244,0.0,72.301945,11.953081,0.056122,0.036079,0.022975,0.0618
1,0.264808,6.138577,0.0,72.232979,36.953081,,0.036079,0.022975,0.0618
2,0.628808,11.394756,0.0,72.199004,167.953081,0.029172,0.036079,0.022975,0.0618
3,0.595475,2.805244,0.0,72.301945,14.953081,,0.036079,0.022975,0.0618
4,0.234828,8.194756,0.0,72.260278,55.953081,,0.036079,0.022975,0.0618


In [38]:
####(4)
#import PUNCTUATION summary table
master_punct_filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/train_summary_puncttype.csv"
masterpunct_df = pd.read_csv(master_punct_filename)
masterpunct_df.head()

masterpunct_df = masterpunct_df.rename(columns={mastergeneral_df.columns[0]: "Class"})
pd.options.display.max_columns = 40
masterpunct_df

Unnamed: 0.1,Unnamed: 0,!,"""",""")",#,%,&,',(,),*,",",-,--,---,----------,.,..,...,....,.....,......,.......,/,:,:(,:),:-),;,?,[,\,],_,§,–,—,‘,“,”
0,Ironic,0.009634,0.009414,8e-06,0.0,0.0011,0.001022,0.001277,0.000581,0.000797,0.005973,0.027248,0.005296,0.000122,0.0,0.0,0.058454,0.000622,0.003805,0.00105,0.0,0.0,0.0,0.002371,0.002534,0.000103,0.000161,0.0,0.000786,0.014821,0.00049,8.3e-05,0.000421,4.9e-05,0.0,8e-06,7.6e-05,8e-06,0.000287,0.000315
1,Non-ironic,0.003882,0.008162,0.0,2.8e-05,0.000592,0.001495,0.001599,0.002037,0.003004,0.00382,0.024176,0.006119,0.00013,1e-05,6e-06,0.059713,0.000438,0.003519,0.000866,0.000285,5.3e-05,1e-05,0.004311,0.001178,9.9e-05,0.000154,2.5e-05,0.001531,0.01057,0.004692,0.0,0.003895,1.1e-05,1.2e-05,1.2e-05,2.8e-05,1.7e-05,0.00011,0.000129


In [39]:
#get results from classification function for IRONIC
#E.g. !, ', *, :( 

ironic_exclam_dist= get_classification_ironic(masterentity_df, summary_indiv_punct, 1, 0, 8)
# # ironic_apost_dist_length= get_classification_ironic(masterentity_df, summary_indiv_punct, 7, 6, 1)
# ironic_star_dist = get_classification_ironic(masterentity_df, summary_indiv_punct, 10, 9, 1)
# ironic_quest_dist = get_classification_ironic(masterentity_df, summary_indiv_punct, 29, 28, 1)
# ironic_sademoji_dist = get_classification_ironic(masterentity_df, summary_indiv_punct, 25, 24, 1)

In [40]:
#Add to IRONIC PREDICTOR DATAFRAME
ironic_predictor_df["!"] = ironic_exclam_dist
# ironic_predictor_df["'"] = ironic_apost_dist_length
# ironic_predictor_df["*"] = ironic_star_dist
# # ironic_predictor_df["LANGUAGE"] = ironic_LANGUAGE_dist
# ironic_predictor_df.head()



In [41]:
#get results from classification function for NON- IRONIC
#E.g. !, ', *, :( 

nonironic_exclam_dist= get_classification_non_ironic(masterentity_df, summary_indiv_punct, 1, 0, 8)
# nonironic_apost_dist_length= get_classification_non_ironic(masterentity_df, summary_indiv_punct, 7, 6, 1)
# nonironic_star_dist = get_classification_non_ironic(masterentity_df, summary_indiv_punct, 10, 9, 1)
# # ironic_sademoji_dist = get_classification_ironic(masterentity_df, summary_indiv_punct, 25, 24, 1)

In [42]:
#Add to IRONIC PREDICTOR DATAFRAME
non_ironic_predictor_df["!"] = nonironic_exclam_dist
# non_ironic_predictor_df["'"] = nonironic_apost_dist_length
# non_ironic_predictor_df["*"] = nonironic_star_dist
# # ironic_predictor_df["LANGUAGE"] = ironic_LANGUAGE_dist
# non_ironic_predictor_df = non_ironic_predictor_df.replace(np.nan, 0)
non_ironic_predictor_df.head()



Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,SARC SYMB /S,PUNCT RICH,UPPERCASE,PROPN,PERSON,GPE,ORG,!
0,0.115636,5.805244,0.0,72.301945,11.953081,0.056122,0.036079,0.022975,0.0618,
1,0.264808,6.138577,0.0,72.232979,36.953081,,0.036079,0.022975,0.0618,
2,0.628808,11.394756,0.0,72.199004,167.953081,0.029172,0.036079,0.022975,0.0618,0.078006
3,0.595475,2.805244,0.0,72.301945,14.953081,,0.036079,0.022975,0.0618,
4,0.234828,8.194756,0.0,72.260278,55.953081,,0.036079,0.022975,0.0618,


# (6) Classification Results

In [43]:
#calculate the sum of all features for each comment
ironic_feature_prediction = ironic_predictor_df.sum(axis=1)

#add final column to ironic predictor df with feature totals
ironic_predictor_df["Feature Weight"] = ironic_predictor_df.sum(axis=1)
ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,PUNCT RICH,SARC SYMB /S,UPPERCASE,PROPN,PERSON,GPE,ORG,!,Feature Weight
0,0.201098,4.261968,29.855297,0.005481,0.050701,0.041195,0.041533,0.03416,0.094556,,34.585988
1,0.179346,4.595301,4.855297,0.005481,0.018265,0.0,0.041533,0.03416,0.094556,,9.823938
2,0.543346,12.938032,126.144703,0.005481,0.052241,0.044099,0.041533,0.03416,0.094556,0.084494,139.982644
3,0.510013,1.261968,26.855297,0.005481,0.050701,0.0,0.041533,0.03416,0.094556,,28.853707
4,0.32029,9.738032,14.144703,0.005481,0.009034,0.0,0.041533,0.03416,0.094556,,24.387788


In [44]:
#calculate the sum of all features for each comment
non_ironic_feature_prediction = non_ironic_predictor_df.sum(axis=1)

#add final column to ironic predictor df with feature totals
non_ironic_predictor_df["Feature Weight"] = non_ironic_predictor_df.sum(axis=1)
non_ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,SARC SYMB /S,PUNCT RICH,UPPERCASE,PROPN,PERSON,GPE,ORG,!,Feature Weight
0,0.115636,5.805244,0.0,72.301945,11.953081,0.056122,0.036079,0.022975,0.0618,,90.352882
1,0.264808,6.138577,0.0,72.232979,36.953081,,0.036079,0.022975,0.0618,,115.7103
2,0.628808,11.394756,0.0,72.199004,167.953081,0.029172,0.036079,0.022975,0.0618,0.078006,252.40368
3,0.595475,2.805244,0.0,72.301945,14.953081,,0.036079,0.022975,0.0618,,90.776599
4,0.234828,8.194756,0.0,72.260278,55.953081,,0.036079,0.022975,0.0618,,136.763797


In [45]:
#create final df with final predicitons
final_predictordf = pd.DataFrame(non_ironic_feature_prediction)

final_predictordf.columns = ["Non-ironic Feature Result"] + final_predictordf.columns.tolist()[1:]
final_predictordf["Ironic Feature Result"] = ironic_feature_prediction

final_predictordf.head()

Unnamed: 0,Non-ironic Feature Result,Ironic Feature Result
0,90.352882,34.585988
1,115.7103,9.823938
2,252.40368,139.982644
3,90.776599,28.853707
4,136.763797,24.387788


In [46]:
final_prediction = final_predicition_results(final_predictordf)

val["Prediction"] = final_prediction
print(len(val))

# #change order of columns (so label and prediction side by side)
val = val[['Comment_Text','Label','Prediction']]

val.dtypes

195


Comment_Text    object
Label            int64
Prediction       int64
dtype: object

In [47]:
classification_result_filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/classification_results.csv"

val.to_csv(classification_result_filename)

# (7) Accuracy Score

In [48]:
accuracy = accuracy(val)
print(accuracy)

0.24102564102564103
