In [1]:
#import/install all packages at the top

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from itertools import chain
from pathlib import Path

In [2]:
#import our own functions from feature_stats.py script
#each imported individually for explicity

from features_stats import get_all_tokens
from features_stats import get_words
from features_stats import get_punct
from features_stats import average_word_length
from features_stats import average_sent_length
from features_stats import check_sarcsymbol
from features_stats import count_uppercase
from features_stats import get_lemmas
from features_stats import get_punct_average
from features_stats import get_sentiment
from features_stats import get_indiv_punct
from features_stats import relative_count_wordtypes
from features_stats import get_entities

from classification_stats import get_classification_ironic
from classification_stats import get_classification_non_ironic
from classification_stats import final_predicition_results
from classification_stats import accuracy

In [3]:
#insert your own file directory path here
file_directory = Path("/Users/laure/OneDrive/Desktop/")

# Sections:

# (1) Import dataset and split

# (2) Functions

# (3) Final Testing (obtain stats ready for comparison)
------- get stats and create summary df
- (3.1) Average Word Count
- (3.2) Average Sentence Count
- (3.3) Punctuation Richness
- (3.4) Sarcasm Symbol
- (3.5) Upper-case Words
- (3.6) (Verb) Lemmas
- (3.7) Sentiment Classification

- (3.8) Individual Punctuation Count
- (3.9) Word Type Count
- (3.10) Named Entity Count

# (4) Additional Functions for Classification
# (5) Classification
# (5) Classification Results
# (6) Accuracy Score

# (1) Import and Split

In [4]:
#Import and Read file as DF with PANDAS (for better visualisation)
gold_label = pd.read_csv(file_directory / "irony-labeled.csv")

In [5]:
#Rename the columns
gold_label.columns = ["Comment_Text", "Label"]

In [6]:
#Split to get two DFs (prep for split)

y = gold_label["Comment_Text"]
x = gold_label["Label"]

In [7]:
#Split the dataset into TEST and TRAIN sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=43)

#Split the TRAIN set again to get VALIDATION set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=43)

In [8]:
#JOIN the series together to get final splits as DFs
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)
val = pd.concat([X_val, y_val], axis=1)

In [9]:
#Check number of comments labelled as ironic vs non-ironic
ironic_test = test[test["Label"] == 1]
nonironic_test = test[test["Label"] == -1]

print(f"Testing data contains {len(ironic_test)} IRONIC comments")
print(f"Testing data contains {len(nonironic_test)} NON- IRONIC comments")

Testing data contains 103 IRONIC comments
Testing data contains 287 NON- IRONIC comments


In [10]:
#Convert TEST set into a dictionary
test_dict = test.set_index(test.index).T.to_dict()

print(len(test_dict))

390


# (3) Final Tetsing
- Obtain all statistics (based on training) ready for comparison in next section

In [11]:
#1) GET ALL TOKENS
tokens = get_all_tokens(test_dict)

In [12]:
#2) Get list of ONLY words (no punct)
word_list = get_words(tokens)

In [13]:
#3) Get list of ONLY punct (no words)
punct_list = get_punct(tokens)

In [14]:
#4) WORD LENGTH
average_word_list = []
for comment in word_list:
    average_word_list.append(average_word_length(comment))

print(len(average_word_list))    
    
#Create DataFrame for Summary of Irony STATS
summary= pd.DataFrame({"Average Word Length": average_word_list})

390


In [15]:
#Create df for total, full returns for irony
total_test= pd.DataFrame({'Comment Parsed':tokens})
total_test["Tokens"] = word_list
total_test["Punctuation"] = punct_list
total_test.head()

Unnamed: 0,Comment Parsed,Tokens,Punctuation
0,"(&, gt;nifty, phenomenon, ,, but, still, usele...","[&, gt;nifty, phenomenon, but, still, useless,...","[,, ., .]"
1,"(The, most, amazing, part, of, that, episode, ...","[The, most, amazing, part, of, that, episode, ...","[., :, -, (, *, *, ), ,, ;, ,, ,, "", ;, "", ......"
2,"(Fuck, that, ,, expand, !, \n\n, How, do, you,...","[Fuck, that, expand, \n\n, How, do, you, think...","[,, !, ?, ., ., .]"
3,"(Or, [, this](http://www, ., amazon, ., com, /...","[Or, this](http://www, amazon, com, The, God, ...","[[, ., ., /, -, -, -, -, ?, -, ), ., .]"
4,"(Indeed, ,, we, 'll, have, to, wait, till, tha...","[Indeed, we, 'll, have, to, wait, till, that, ...","[,, ., .]"


In [16]:
#4) SENTENCE LENGTH
average_sentence_list = []
for x in tokens:
    average_sentence_list.append(average_sent_length(x))

#Add to Summary of Irony STATS df
summary["Average Sentence Length"] = average_sentence_list
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length
0,5.451613,17.0
1,4.281022,29.25
2,4.327273,12.2
3,7.037037,6.666667
4,3.411765,10.0


In [17]:
#5) /S SYMBOLS

sarcfunc = []
for x in tokens:
    sarcfunc.append(check_sarcsymbol(x))


sarcsymb_list = []        
for l in sarcfunc:
    if len(l) >= 1:
        sarcsymb_list.append(l)
    else:
        sarcsymb_list.append([0])

#Remove list layer 
sarcsymb_list = list(chain.from_iterable(sarcsymb_list))



summary["Average '/s' symbol count"] = sarcsymb_list

In [18]:
#7) UPPER CASE WORDS (total)

uppercase_list = []
for b in tokens:
    uppercase_list.append((count_uppercase(b)))
    
#Remove list layer 
uppercase_list = list(chain.from_iterable(uppercase_list))

summary["Average Upper-case Words"] = uppercase_list
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length,Average '/s' symbol count,Average Upper-case Words
0,5.451613,17.0,0,0.0
1,4.281022,29.25,0,0.002849
2,4.327273,12.2,0,0.016393
3,7.037037,6.666667,0,0.0
4,3.411765,10.0,0,0.0


In [19]:
#6) PUNCTUATION RICHNESS
average_punct_list = get_punct_average(punct_list, tokens)

summary["Punctuation Richness"] = average_punct_list
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length,Average '/s' symbol count,Average Upper-case Words,Punctuation Richness
0,5.451613,17.0,0,0.0,20.0
1,4.281022,29.25,0,0.002849,252.5
2,4.327273,12.2,0,0.016393,36.5
3,7.037037,6.666667,0,0.0,33.0
4,3.411765,10.0,0,0.0,13.0


In [20]:
#9) AVERAGE NUMBER OF LEMMAS

lemma_list = []
for doc in tokens:
    lemma_list.append(get_lemmas(doc))
    
summary["Verb Lemma Average"] = lemma_list
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length,Average '/s' symbol count,Average Upper-case Words,Punctuation Richness,Verb Lemma Average
0,5.451613,17.0,0,0.0,20.0,0.147059
1,4.281022,29.25,0,0.002849,252.5,0.076923
2,4.327273,12.2,0,0.016393,36.5,0.180328
3,7.037037,6.666667,0,0.0,33.0,0.0
4,3.411765,10.0,0,0.0,13.0,0.2


In [21]:
#10 SENTIMENT CLASSIFICATION
#1 = positive, -1 = negative

sentiment = get_sentiment(test_dict)

summary["Sentiment Classification"] = sentiment 

In [22]:
#replace NAN values
summary = summary.replace(np.nan, 0)
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length,Average '/s' symbol count,Average Upper-case Words,Punctuation Richness,Verb Lemma Average,Sentiment Classification
0,5.451613,17.0,0,0.0,20.0,0.147059,1
1,4.281022,29.25,0,0.002849,252.5,0.076923,1
2,4.327273,12.2,0,0.016393,36.5,0.180328,1
3,7.037037,6.666667,0,0.0,33.0,0.0,-1
4,3.411765,10.0,0,0.0,13.0,0.2,1


In [23]:
#Save test general table to CSV
summary.to_csv(file_directory / "test_summary_general.csv")

In [24]:
#8) INDIVIDUAL PUNCTUATION AVERAGE

average_indiv_punc_list = []
for x in tokens:
    average_indiv_punc_list.append(get_indiv_punct(x))


summary_indiv_punct = pd.DataFrame(average_indiv_punc_list)

In [25]:
#replace NAN values

summary_indiv_punct = summary_indiv_punct.replace(np.nan, 0)
pd.options.display.max_columns = 40
summary_indiv_punct.head()

Unnamed: 0,!,"""",#,%,&,',(,),*,",",-,--,.,..,...,....,.....,.......,/,:,:(,:/,;,?,@,[,],_,·,–,—,“,”,…
0,0.0,0.0,0.0,0.0,0.029412,0.0,0.0,0.0,0.0,0.029412,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.022792,0.0,0.0,0.031339,0.019943,0.005698,0.005698,0.02849,0.051282,0.002849,0.0,0.011396,0.0,0.019943,0.0,0.0,0.0,0.0,0.008547,0.0,0.0,0.034188,0.005698,0.0,0.002849,0.002849,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.016393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016393,0.0,0.0,0.04918,0.0,0.0,0.0,0.0,0.0,0.016393,0.0,0.0,0.0,0.0,0.016393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025,0.0,0.0,0.125,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.025,0.0,0.025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
#Save test punctuation table to CSV
summary_indiv_punct.to_csv(file_directory / "test_summary_puncttype.csv")

In [27]:
#7) WORD TYPE AVERAGE 

average_wordtype_list = []
for comment in tokens:
    average_wordtype_list.append(relative_count_wordtypes(comment))

summary_wordtypedf = pd.DataFrame(average_wordtype_list)

In [28]:
#replace NAN values
summary_wordtypedf = summary_wordtypedf.replace(np.nan, 0)
summary_wordtypedf.head()

Unnamed: 0,ADJ,ADP,ADV,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SPACE,SYM,VERB,X
0,0.117647,0.117647,0.029412,0.058824,0.058824,0.0,0.235294,0.029412,0.0,0.058824,0.0,0.088235,0.058824,0.0,0.147059,0.0
1,0.062678,0.08547,0.05698,0.048433,0.065527,0.005698,0.111111,0.008547,0.008547,0.042735,0.076923,0.219373,0.02849,0.0,0.17094,0.008547
2,0.098361,0.04918,0.04918,0.016393,0.098361,0.0,0.196721,0.0,0.016393,0.032787,0.04918,0.098361,0.065574,0.016393,0.213115,0.0
3,0.05,0.025,0.05,0.025,0.025,0.0,0.1,0.0,0.0,0.0,0.225,0.325,0.025,0.1,0.0,0.05
4,0.0,0.15,0.1,0.0,0.1,0.0,0.15,0.0,0.05,0.05,0.0,0.15,0.05,0.0,0.2,0.0


In [29]:
#Save test pos table to CSV
summary_wordtypedf.to_csv(file_directory / "test_summary_pos.csv")

In [30]:
#10) NAMED ENTITIES AVERAGE
named_entity_list = []
for comment in tokens:
    named_entity_list.append(get_entities(comment))
    
summary_named_entity = pd.DataFrame(named_entity_list)

In [31]:
#replace NAN values

summary_named_entity = summary_named_entity.replace(np.nan, 0) 
summary_named_entity.head()

Unnamed: 0,CARDINAL,DATE,EVENT,FAC,GPE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,TIME,WORK_OF_ART
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.008547,0.0,0.0,0.0,0.011396,0.0,0.0,0.0,0.025641,0.0,0.022792,0.0,0.017094,0.0,0.0,0.0
2,0.0,0.0,0.016393,0.0,0.016393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
#Save test general table to CSV
summary_named_entity.to_csv(file_directory / "test_summary_namedentity.csv")

# (4) Additional Functions (Classification)

# (5) Classification

Steps:
- (1) Import Master DF ###(1) GENERAL
- (2) Get Results for each comparison using classification function (1) Ir, (2) Non-ir
- (3) Create PredictorDF for (1) Ironic, (2) Non-ironic

''''''''''''''''''''''''''''''
- (4) Import Master DF ###(2) POS
- (5) Get Results for each comparison using classification function (1) Ir, (2) Non-ir
- (6) Add to each PredictorDF for (1) Ironic, (2) Non-ironic

''''''''''''''''''''''''''''''
- (7) Import Master DF ###(3) NAMED ENTITY
- Repeat steps 5 & 6

''''''''''''''''''''''''''''''
- (8) Import Master DF ###(4) PUNCTUATION
- Repeat steps 5 & 6 

In [33]:
####(1)
#import GENERAL summary table
mastergeneral_df = pd.read_csv(file_directory / "train_summary_general.csv")
mastergeneral_df.head()

mastergeneral_df = mastergeneral_df.rename(columns={mastergeneral_df.columns[0]: "Class"})

mastergeneral_df

Unnamed: 0,Class,Average Word Length,Average Sentence Length,Average '/s' symbol count,Average Upper-case Words,Punctuation Richness,Verb Lemma Average,Sentiment Classification
0,Ironic Comments,4.304183,14.261968,0.000783,0.02535,20.927649,0.154378,0.405685
1,Non-ironic Comments,4.41101,15.805244,0.0,0.02346,36.150972,0.146618,0.455476


In [34]:
#get results from classification function for IRONIC
ironic_average_word_length = get_classification_ironic(mastergeneral_df, summary, 1, 0, 15)
ironic_average_sent_length = get_classification_ironic(mastergeneral_df, summary, 2, 1, 1)
ironic_average_sarcsymb = get_classification_ironic(mastergeneral_df, summary, 3, 2, 1)
ironic_average_uppercase = get_classification_ironic(mastergeneral_df, summary, 4, 3, 1)
ironic_punct_richness = get_classification_ironic(mastergeneral_df, summary, 5, 4, 100)
ironic_average_verblemma = get_classification_ironic(mastergeneral_df, summary, 6, 5, 1)
ironic_average_sentiment = get_classification_ironic(mastergeneral_df, summary, 7, 6, 1)

In [35]:
#IRONIC
#Create PREDICTOR DATAFRAME with classifications (all features)

ironic_predictor_df = pd.DataFrame(ironic_average_word_length)
ironic_predictor_df.columns = ['WORD LENGTH'] + ironic_predictor_df.columns.tolist()[1:]

ironic_predictor_df["SENTENCE LENGTH"] = ironic_average_sent_length
ironic_predictor_df["PUNCT RICH"] = ironic_punct_richness
ironic_predictor_df["SARC SYMB /S"] = ironic_average_sarcsymb
ironic_predictor_df["UPPERCASE"] = ironic_average_uppercase
ironic_predictor_df["Verb Lemma Average"] = ironic_average_verblemma
ironic_predictor_df["Sentiment Classification"] = ironic_average_sentiment

ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,PUNCT RICH,SARC SYMB /S,UPPERCASE,Verb Lemma Average,Sentiment Classification
0,17.211454,2.738032,92.764858,0.000783,0.02535,0.007319,0.594315
1,0.347411,14.988032,23157.235142,0.000783,0.022501,0.077455,0.594315
2,0.346352,2.061968,1557.235142,0.000783,0.008957,0.02595,0.594315
3,40.992816,7.595301,1207.235142,0.000783,0.02535,0.154378,1.405685
4,13.386269,4.261968,792.764858,0.000783,0.02535,0.045622,0.594315


In [36]:
#get results from classification function for NON-IRONIC
non_ironic_average_word_length = get_classification_non_ironic(mastergeneral_df, summary, 1, 0, 15)
non_ironic_average_sent_length= get_classification_non_ironic(mastergeneral_df, summary, 2, 1, 1)
non_ironic_average_sarcsymb= get_classification_non_ironic(mastergeneral_df, summary, 3, 2, 1)
non_ironic_average_uppercase = get_classification_non_ironic(mastergeneral_df, summary, 4, 4, 1)
non_ironic_punct_richness = get_classification_non_ironic(mastergeneral_df, summary, 5, 3, 100)
non_ironic_average_verblemma = get_classification_non_ironic(mastergeneral_df, summary, 6, 5, 1)
non_ironic_average_sentiment = get_classification_non_ironic(mastergeneral_df, summary, 7, 6, 1)

In [37]:
#NON-IRONIC
#Create PREDICTOR DATAFRAME with classifications (all features)

non_ironic_predictor_df = pd.DataFrame(non_ironic_average_word_length)
non_ironic_predictor_df.columns = ['WORD LENGTH'] + non_ironic_predictor_df.columns.tolist()[1:]

non_ironic_predictor_df["SENTENCE LENGTH"] = non_ironic_average_sent_length
non_ironic_predictor_df["SARC SYMB /S"] = non_ironic_average_sarcsymb
non_ironic_predictor_df["PUNCT RICH"] = non_ironic_punct_richness
non_ironic_predictor_df["UPPERCASE"] = non_ironic_average_uppercase
non_ironic_predictor_df["Verb Lemma Average"] = non_ironic_average_verblemma
non_ironic_predictor_df["Sentiment Classification"] = non_ironic_average_sentiment
non_ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,SARC SYMB /S,PUNCT RICH,UPPERCASE,Verb Lemma Average,Sentiment Classification
0,15.609041,1.194756,0.0,3615.097236,19.97654,0.000441,0.544524
1,1.949824,13.444756,0.0,3614.812336,252.47654,0.069695,0.544524
2,1.256062,3.605244,0.0,3613.457892,36.47654,0.03371,0.544524
3,39.390403,9.138577,0.0,3615.097236,32.97654,0.146618,1.455476
4,14.988682,5.805244,0.0,3615.097236,12.97654,0.053382,0.544524


In [38]:
####(2)
#import POS summary table

masterwordtype_df = pd.read_csv(file_directory / "train_summary_pos.csv")
masterwordtype_df.head()

masterwordtype_df = masterwordtype_df.rename(columns={mastergeneral_df.columns[0]: "Class"}) 
masterwordtype_df.head()

Unnamed: 0.1,Unnamed: 0,ADJ,ADP,ADV,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SPACE,SYM,VERB,X
0,Ironic,0.08507,0.07827,0.064795,0.017629,0.073488,0.007642,0.164352,0.007932,0.022388,0.063109,0.058805,0.145819,0.03144,0.004067,0.174846,0.000348
1,Non-ironic,0.080089,0.083053,0.070846,0.023582,0.081,0.005765,0.158435,0.009828,0.022023,0.061435,0.043878,0.141197,0.032398,0.005573,0.180089,0.00081


In [39]:
#get results from classification function for IRONIC
#E.g. PRON, PROPN, NOUN

# ironic_PRON_dist= get_classification_ironic(masterwordtype_df, summary_wordtypedf, 10, 9, 1)
# ironic_PROPN_dist_length= get_classification_ironic(masterwordtype_df, summary_wordtypedf, 11, 10, 1)
ironic_NOUN_dist = get_classification_ironic(masterwordtype_df, summary_wordtypedf, 7, 6, 1)
ironic_CCONJ_dist = get_classification_ironic(masterwordtype_df, summary_wordtypedf, 4, 3, 1)
ironic_VERB_dist = get_classification_ironic(masterwordtype_df, summary_wordtypedf, 15, 14, 1)

In [40]:
#Add to IRONIC PREDICTOR DATAFRAME
# ironic_predictor_df["PRON"] = ironic_PRON_dist
# ironic_predictor_df["PROPN"] = ironic_PROPN_dist_length
ironic_predictor_df["NOUN"] = ironic_NOUN_dist
ironic_predictor_df["CCONJ"] = ironic_CCONJ_dist
ironic_predictor_df["VERB"] = ironic_VERB_dist
ironic_predictor_df = ironic_predictor_df.replace(np.nan, 0)
ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,PUNCT RICH,SARC SYMB /S,UPPERCASE,Verb Lemma Average,Sentiment Classification,NOUN,CCONJ,VERB
0,17.211454,2.738032,92.764858,0.000783,0.02535,0.007319,0.594315,0.070942,0.041194,0.027787
1,0.347411,14.988032,23157.235142,0.000783,0.022501,0.077455,0.594315,0.053241,0.030804,0.003905
2,0.346352,2.061968,1557.235142,0.000783,0.008957,0.02595,0.594315,0.032369,0.001236,0.038269
3,40.992816,7.595301,1207.235142,0.000783,0.02535,0.154378,1.405685,0.064352,0.007371,0.174846
4,13.386269,4.261968,792.764858,0.000783,0.02535,0.045622,0.594315,0.014352,0.017629,0.025154


In [41]:
#get results from classification function for NON-IRONIC
#E.g. PRON, PROPN, NOUN

# nonironic_PRON_dist= get_classification_non_ironic(masterwordtype_df, summary_wordtypedf, 10, 9, 1)
# nonironic_PROPN_dist_length= get_classification_non_ironic(masterwordtype_df, summary_wordtypedf, 11, 10, 1)
nonironic_NOUN_dist = get_classification_non_ironic(masterwordtype_df, summary_wordtypedf, 7, 6, 1)
nonironic_CCONJ_dist = get_classification_non_ironic(masterwordtype_df, summary_wordtypedf, 4, 3, 1)
nonironic_VERB_dist = get_classification_non_ironic(masterwordtype_df, summary_wordtypedf, 15, 14, 1)

In [42]:
#Add to NON-IRONIC PREDICTOR DATAFRAME
# non_ironic_predictor_df["PRON"] = nonironic_PRON_dist
# non_ironic_predictor_df["PROPN"] = nonironic_PROPN_dist_length
non_ironic_predictor_df["NOUN"] = nonironic_NOUN_dist
non_ironic_predictor_df["CCONJ"] = nonironic_CCONJ_dist
non_ironic_predictor_df["VERB"] = nonironic_VERB_dist
non_ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,SARC SYMB /S,PUNCT RICH,UPPERCASE,Verb Lemma Average,Sentiment Classification,NOUN,CCONJ,VERB
0,15.609041,1.194756,0.0,3615.097236,19.97654,0.000441,0.544524,0.076859,0.035242,0.03303
1,1.949824,13.444756,0.0,3614.812336,252.47654,0.069695,0.544524,0.047324,0.024851,0.009149
2,1.256062,3.605244,0.0,3613.457892,36.47654,0.03371,0.544524,0.038286,0.007189,0.033026
3,39.390403,9.138577,0.0,3615.097236,32.97654,0.146618,1.455476,0.058435,0.001418,0.180089
4,14.988682,5.805244,0.0,3615.097236,12.97654,0.053382,0.544524,0.008435,0.023582,0.019911


In [43]:
####(3)
#import NER summary table
masterentity_df = pd.read_csv(file_directory / "train_summary_namedentity.csv")
masterentity_df.head()

masterentity_df.rename(columns={mastergeneral_df.columns[0]: "Class"})
masterentity_df = masterentity_df.replace(np.nan, 0)
masterentity_df

Unnamed: 0.1,Unnamed: 0,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART
0,Ironic,0.004144,0.003744,6.6e-05,8e-06,0.006832,0.0,0.000155,0.00082,0.001215,0.00809,0.000965,0.013508,0.001186,0.010383,0.000144,8e-06,0.000418,0.00028
1,Non-ironic,0.004955,0.004464,0.000106,7.8e-05,0.004595,4.4e-05,0.000126,0.000499,0.000849,0.006177,0.000913,0.008829,0.000773,0.00902,4.9e-05,0.000225,0.000444,0.000677


In [44]:
#get results from classification function for IRONIC
#E.g. PERSON, LOC, GPE, LANGUAGE (none)

ironic_PERSON_dist= get_classification_ironic(masterentity_df, summary_named_entity, 14, 13, 1)
ironic_LOC_dist_length= get_classification_ironic(masterentity_df, summary_named_entity, 8, 7, 1)
ironic_GPE_dist = get_classification_ironic(masterentity_df, summary_named_entity, 5, 4, 30)
ironic_LANGUAGE_dist = get_classification_ironic(masterentity_df, summary_named_entity, 6, 5, 1)
ironic_ORG_dist= get_classification_ironic(masterentity_df, summary_named_entity, 12, 11, 1)

In [45]:
#Add to IRONIC PREDICTOR DATAFRAME
ironic_predictor_df["PERSON"] = ironic_PERSON_dist
ironic_predictor_df["LOC"] = ironic_LOC_dist_length
ironic_predictor_df["GPE"] = ironic_GPE_dist
ironic_predictor_df["LANGUAGE"] = ironic_LANGUAGE_dist
ironic_predictor_df["ORG"] = ironic_ORG_dist

ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,PUNCT RICH,SARC SYMB /S,UPPERCASE,Verb Lemma Average,Sentiment Classification,NOUN,CCONJ,VERB,PERSON,LOC,GPE,LANGUAGE,ORG
0,17.211454,2.738032,92.764858,0.000783,0.02535,0.007319,0.594315,0.070942,0.041194,0.027787,0.010383,0.00082,0.204958,0.0,0.013508
1,0.347411,14.988032,23157.235142,0.000783,0.022501,0.077455,0.594315,0.053241,0.030804,0.003905,0.010383,0.00082,0.136922,0.0,0.013508
2,0.346352,2.061968,1557.235142,0.000783,0.008957,0.02595,0.594315,0.032369,0.001236,0.038269,0.010383,0.00082,0.286845,0.0,0.013508
3,40.992816,7.595301,1207.235142,0.000783,0.02535,0.154378,1.405685,0.064352,0.007371,0.174846,0.010383,0.00082,0.204958,0.0,0.013508
4,13.386269,4.261968,792.764858,0.000783,0.02535,0.045622,0.594315,0.014352,0.017629,0.025154,0.010383,0.00082,0.204958,0.0,0.013508


In [46]:
#get results from classification function for NON- IRONIC
#E.g. PERSON, LOC, GPE, LANGUAGE (none)

nonironic_PERSON_dist= get_classification_non_ironic(masterentity_df, summary_named_entity, 14, 13, 1)
nonironic_LOC_dist= get_classification_non_ironic(masterentity_df, summary_named_entity, 8, 7, 1)
nonironic_GPE_dist = get_classification_non_ironic(masterentity_df, summary_named_entity, 5, 4, 30)
nonironic_LANGUAGE_dist = get_classification_non_ironic(masterentity_df, summary_named_entity, 6, 5, 1)
nonironic_ORG_dist = get_classification_non_ironic(masterentity_df, summary_named_entity, 12, 11, 1)

In [47]:
#Add to NON-IRONIC PREDICTOR DATAFRAME
non_ironic_predictor_df["PERSON"] = nonironic_PERSON_dist
non_ironic_predictor_df["LOC"] = nonironic_LOC_dist
non_ironic_predictor_df["GPE"] = nonironic_GPE_dist
non_ironic_predictor_df["LANGUAGE"] = nonironic_LANGUAGE_dist
non_ironic_predictor_df["ORG"] = nonironic_ORG_dist


non_ironic_predictor_df = non_ironic_predictor_df.replace(np.nan, 0)
non_ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,SARC SYMB /S,PUNCT RICH,UPPERCASE,Verb Lemma Average,Sentiment Classification,NOUN,CCONJ,VERB,PERSON,LOC,GPE,LANGUAGE,ORG
0,15.609041,1.194756,0.0,3615.097236,19.97654,0.000441,0.544524,0.076859,0.035242,0.03303,0.00902,0.000499,0.137849,4.4e-05,0.008829
1,1.949824,13.444756,0.0,3614.812336,252.47654,0.069695,0.544524,0.047324,0.024851,0.009149,0.00902,0.000499,0.204031,4.4e-05,0.008829
2,1.256062,3.605244,0.0,3613.457892,36.47654,0.03371,0.544524,0.038286,0.007189,0.033026,0.00902,0.000499,0.353954,4.4e-05,0.008829
3,39.390403,9.138577,0.0,3615.097236,32.97654,0.146618,1.455476,0.058435,0.001418,0.180089,0.00902,0.000499,0.137849,4.4e-05,0.008829
4,14.988682,5.805244,0.0,3615.097236,12.97654,0.053382,0.544524,0.008435,0.023582,0.019911,0.00902,0.000499,0.137849,4.4e-05,0.008829


In [48]:
####(4)
#import PUNCTUATION summary table
masterpunct_df = pd.read_csv(file_directory / "train_summary_puncttype.csv")
masterpunct_df.head()

masterpunct_df = masterpunct_df.rename(columns={mastergeneral_df.columns[0]: "Class"})
# len(masterpunct_df.columns)
pd.options.display.max_columns = 40
masterpunct_df

Unnamed: 0.1,Unnamed: 0,!,"""",""")",#,%,&,',(,),*,",",-,--,---,----------,.,..,...,....,.....,......,.......,/,:,:(,:),:-),;,?,[,\,],_,§,–,—,‘,“,”
0,Ironic,0.009634,0.009414,8e-06,0.0,0.0011,0.001022,0.001277,0.000581,0.000797,0.005973,0.027248,0.005296,0.000122,0.0,0.0,0.058454,0.000622,0.003805,0.00105,0.0,0.0,0.0,0.002371,0.002534,0.000103,0.000161,0.0,0.000786,0.014821,0.00049,8.3e-05,0.000421,4.9e-05,0.0,8e-06,7.6e-05,8e-06,0.000287,0.000315
1,Non-ironic,0.003882,0.008162,0.0,2.8e-05,0.000592,0.001495,0.001599,0.002037,0.003004,0.00382,0.024176,0.006119,0.00013,1e-05,6e-06,0.059713,0.000438,0.003519,0.000866,0.000285,5.3e-05,1e-05,0.004311,0.001178,9.9e-05,0.000154,2.5e-05,0.001531,0.01057,0.004692,0.0,0.003895,1.1e-05,1.2e-05,1.2e-05,2.8e-05,1.7e-05,0.00011,0.000129


In [49]:
#get results from classification function for IRONIC
#E.g. !, ', *, :( 

ironic_exclam_dist = get_classification_ironic(masterpunct_df, summary_indiv_punct, 1, 0, 10)
ironic_apost_dist_length = get_classification_ironic(masterpunct_df, summary_indiv_punct, 7, 6, 1)
ironic_hash_dist_length = get_classification_ironic(masterpunct_df, summary_indiv_punct,4, 3, 8)
# ironic_leftsquare_dist_length = get_classification_ironic(masterpunct_df, summary_indiv_punct, 30, 29, 8)
ironic_star_dist = get_classification_ironic(masterpunct_df, summary_indiv_punct, 10, 9, 1)
# ironic_quest_dist = get_classification_ironic(masterpunct_df, summary_indiv_punct, 29, 28, 1)
ironic_sademoji_dist = get_classification_ironic(masterpunct_df, summary_indiv_punct, 25, 24, 1)

In [50]:
# Add to IRONIC PREDICTOR DATAFRAME
ironic_predictor_df["!"] = ironic_exclam_dist
ironic_predictor_df["'"] = ironic_apost_dist_length
ironic_predictor_df["#"] = ironic_hash_dist_length
ironic_predictor_df["*"] = ironic_star_dist
ironic_predictor_df[":("] = ironic_sademoji_dist
# ironic_predictor_df["["] = ironic_leftsquare_dist_length

ironic_predictor_df



Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,PUNCT RICH,SARC SYMB /S,UPPERCASE,Verb Lemma Average,Sentiment Classification,NOUN,CCONJ,VERB,PERSON,LOC,GPE,LANGUAGE,ORG,!,',#,*,:(
0,17.211454,2.738032,92.764858,0.000783,0.025350,0.007319,0.594315,0.070942,0.041194,0.027787,0.010383,0.000820,0.204958,0.0,0.013508,0.096337,0.001277,0.000000,0.023438,0.000103
1,0.347411,14.988032,23157.235142,0.000783,0.022501,0.077455,0.594315,0.053241,0.030804,0.003905,0.010383,0.000820,0.136922,0.0,0.013508,0.096337,0.004421,0.000000,0.045309,0.000103
2,0.346352,2.061968,1557.235142,0.000783,0.008957,0.025950,0.594315,0.032369,0.001236,0.038269,0.010383,0.000820,0.286845,0.0,0.013508,0.067598,0.001277,0.000000,0.010420,0.000103
3,40.992816,7.595301,1207.235142,0.000783,0.025350,0.154378,1.405685,0.064352,0.007371,0.174846,0.010383,0.000820,0.204958,0.0,0.013508,0.096337,0.001277,0.000000,0.005973,0.000103
4,13.386269,4.261968,792.764858,0.000783,0.025350,0.045622,0.594315,0.014352,0.017629,0.025154,0.010383,0.000820,0.204958,0.0,0.013508,0.096337,0.001277,0.000000,0.044027,0.000103
5,8.312739,4.261968,1392.764858,0.000783,0.025350,0.045622,0.594315,0.164352,0.082371,0.025154,0.010383,0.000820,0.204958,0.0,0.013508,0.096337,0.001277,0.000000,0.094027,0.000103
6,0.544882,2.238032,2207.235142,0.000783,0.010199,0.048318,0.594315,0.058292,0.017629,0.006973,0.010383,0.000820,0.204958,0.0,0.013508,0.096337,0.001277,0.000000,0.024330,0.000103
7,2.169122,5.538032,3357.235142,0.000783,0.015249,0.007238,0.594315,0.027567,0.002573,0.027175,0.010383,0.000820,0.401102,0.0,0.013508,0.096337,0.001277,0.000000,0.005973,0.000103
8,5.867087,12.738032,342.764858,0.000783,0.011687,0.006230,1.405685,0.016204,0.017629,0.026697,0.010383,0.000820,0.204958,0.0,0.013508,0.096337,0.001277,0.000000,0.005973,0.000103
9,8.312739,5.261968,507.235142,0.000783,0.025350,0.095622,0.594315,0.081019,0.037926,0.102932,0.010383,0.000820,0.204958,0.0,0.013508,0.096337,0.001277,0.000000,0.049582,0.000103


In [51]:
#get results from classification function for NON- IRONIC
#E.g. !, ', *, :( 

nonironic_exclam_dist= get_classification_non_ironic(masterpunct_df, summary_indiv_punct, 1, 0, 10)
nonironic_apost_dist_length= get_classification_non_ironic(masterpunct_df, summary_indiv_punct, 7, 6, 1)
nonironic_hash_dist_length = get_classification_non_ironic(masterpunct_df, summary_indiv_punct,4, 3, 8)
nonironic_star_dist = get_classification_non_ironic(masterpunct_df, summary_indiv_punct, 10, 9, 1)
nonironic_sademoji_dist = get_classification_non_ironic(masterpunct_df, summary_indiv_punct, 25, 24, 1)
# nonironic_leftsquare_dist_length = get_classification_non_ironic(masterpunct_df, summary_indiv_punct, 30, 29, 8)

In [52]:
# Add to IRONIC PREDICTOR DATAFRAME
non_ironic_predictor_df["!"] = nonironic_exclam_dist
non_ironic_predictor_df["'"] = nonironic_apost_dist_length
non_ironic_predictor_df["#"] = nonironic_hash_dist_length
non_ironic_predictor_df["*"] = nonironic_star_dist
non_ironic_predictor_df[":("] = nonironic_sademoji_dist
# non_ironic_predictor_df["["] = nonironic_leftsquare_dist_length

non_ironic_predictor_df = non_ironic_predictor_df.replace(np.nan, 0)
non_ironic_predictor_df.head()



Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,SARC SYMB /S,PUNCT RICH,UPPERCASE,Verb Lemma Average,Sentiment Classification,NOUN,CCONJ,VERB,PERSON,LOC,GPE,LANGUAGE,ORG,!,',#,*,:(
0,15.609041,1.194756,0.0,3615.097236,19.97654,0.000441,0.544524,0.076859,0.035242,0.03303,0.00902,0.000499,0.137849,4.4e-05,0.008829,0.038815,0.001599,0.000221,0.025591,9.9e-05
1,1.949824,13.444756,0.0,3614.812336,252.47654,0.069695,0.544524,0.047324,0.024851,0.009149,0.00902,0.000499,0.204031,4.4e-05,0.008829,0.038815,0.004099,0.000221,0.047462,9.9e-05
2,1.256062,3.605244,0.0,3613.457892,36.47654,0.03371,0.544524,0.038286,0.007189,0.033026,0.00902,0.000499,0.353954,4.4e-05,0.008829,0.125119,0.001599,0.000221,0.012573,9.9e-05
3,39.390403,9.138577,0.0,3615.097236,32.97654,0.146618,1.455476,0.058435,0.001418,0.180089,0.00902,0.000499,0.137849,4.4e-05,0.008829,0.038815,0.001599,0.000221,0.00382,9.9e-05
4,14.988682,5.805244,0.0,3615.097236,12.97654,0.053382,0.544524,0.008435,0.023582,0.019911,0.00902,0.000499,0.137849,4.4e-05,0.008829,0.038815,0.001599,0.000221,0.04618,9.9e-05


# (6) Classification Results

In [53]:
#calculate the sum of all features for each comment
ironic_feature_prediction = ironic_predictor_df.sum(axis=1)

#add final column to ironic predictor df with feature totals
ironic_predictor_df["Feature Weight"] = ironic_feature_prediction

ironic_predictor_df = ironic_predictor_df.replace(np.nan, 0)
ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,PUNCT RICH,SARC SYMB /S,UPPERCASE,Verb Lemma Average,Sentiment Classification,NOUN,CCONJ,VERB,PERSON,LOC,GPE,LANGUAGE,ORG,!,',#,*,:(,Feature Weight
0,17.211454,2.738032,92.764858,0.000783,0.02535,0.007319,0.594315,0.070942,0.041194,0.027787,0.010383,0.00082,0.204958,0.0,0.013508,0.096337,0.001277,0.0,0.023438,0.000103,113.83286
1,0.347411,14.988032,23157.235142,0.000783,0.022501,0.077455,0.594315,0.053241,0.030804,0.003905,0.010383,0.00082,0.136922,0.0,0.013508,0.096337,0.004421,0.0,0.045309,0.000103,23173.661393
2,0.346352,2.061968,1557.235142,0.000783,0.008957,0.02595,0.594315,0.032369,0.001236,0.038269,0.010383,0.00082,0.286845,0.0,0.013508,0.067598,0.001277,0.0,0.01042,0.000103,1560.736294
3,40.992816,7.595301,1207.235142,0.000783,0.02535,0.154378,1.405685,0.064352,0.007371,0.174846,0.010383,0.00082,0.204958,0.0,0.013508,0.096337,0.001277,0.0,0.005973,0.000103,1257.989384
4,13.386269,4.261968,792.764858,0.000783,0.02535,0.045622,0.594315,0.014352,0.017629,0.025154,0.010383,0.00082,0.204958,0.0,0.013508,0.096337,0.001277,0.0,0.044027,0.000103,811.507714


In [54]:
#calculate the sum of all features for each comment
non_ironic_feature_prediction = non_ironic_predictor_df.sum(axis=1)

#add final column to ironic predictor df with feature totals
non_ironic_predictor_df["Feature Weight"] = non_ironic_feature_prediction

non_ironic_predictor_df = non_ironic_predictor_df.replace(np.nan, 0)
non_ironic_predictor_df.head()

# non_ironic_feature_prediction

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,SARC SYMB /S,PUNCT RICH,UPPERCASE,Verb Lemma Average,Sentiment Classification,NOUN,CCONJ,VERB,PERSON,LOC,GPE,LANGUAGE,ORG,!,',#,*,:(,Feature Weight
0,15.609041,1.194756,0.0,3615.097236,19.97654,0.000441,0.544524,0.076859,0.035242,0.03303,0.00902,0.000499,0.137849,4.4e-05,0.008829,0.038815,0.001599,0.000221,0.025591,9.9e-05,3652.790236
1,1.949824,13.444756,0.0,3614.812336,252.47654,0.069695,0.544524,0.047324,0.024851,0.009149,0.00902,0.000499,0.204031,4.4e-05,0.008829,0.038815,0.004099,0.000221,0.047462,9.9e-05,3883.692118
2,1.256062,3.605244,0.0,3613.457892,36.47654,0.03371,0.544524,0.038286,0.007189,0.033026,0.00902,0.000499,0.353954,4.4e-05,0.008829,0.125119,0.001599,0.000221,0.012573,9.9e-05,3655.96443
3,39.390403,9.138577,0.0,3615.097236,32.97654,0.146618,1.455476,0.058435,0.001418,0.180089,0.00902,0.000499,0.137849,4.4e-05,0.008829,0.038815,0.001599,0.000221,0.00382,9.9e-05,3698.645589
4,14.988682,5.805244,0.0,3615.097236,12.97654,0.053382,0.544524,0.008435,0.023582,0.019911,0.00902,0.000499,0.137849,4.4e-05,0.008829,0.038815,0.001599,0.000221,0.04618,9.9e-05,3649.760693


In [55]:
#create final df with final predicitons
final_predictordf = pd.DataFrame(non_ironic_feature_prediction)

final_predictordf.columns = ["Non-ironic Feature Result"] + final_predictordf.columns.tolist()[1:]
final_predictordf["Ironic Feature Result"] = ironic_feature_prediction

final_predictordf.head()

Unnamed: 0,Non-ironic Feature Result,Ironic Feature Result
0,3652.790236,113.83286
1,3883.692118,23173.661393
2,3655.96443,1560.736294
3,3698.645589,1257.989384
4,3649.760693,811.507714


In [60]:
#save test feature weight scores as csv
final_predictordf.to_csv(file_directory / "testing_featureweight_results.csv")

In [56]:
final_prediction = final_predicition_results(final_predictordf)

test["Prediction"] = final_prediction
print(len(val))

# #change order of columns (so label and prediction side by side)
test = test[['Comment_Text','Label','Prediction']]

#test.dtypes
test


195


Unnamed: 0,Comment_Text,Label,Prediction
305,"&gt;nifty phenomenon, but still useless nothin...",-1,1
1923,The most amazing part of that episode was when...,-1,-1
861,"Fuck that, expand! \n\nHow do you think the fi...",-1,1
1667,Or [this](http://www. amazon. com/The-God-Delu...,-1,1
1594,"Indeed, we'll have to wait till that video see...",-1,1
178,"But hey, we're not here to generalize.",1,1
1340,This is about how I read it. The author is s...,-1,1
999,I like this video but not as much as the one w...,1,1
1872,What about the money BO is taking out of the F...,-1,1
1873,yup.. if they really didnt think it would wor...,-1,1


In [57]:
#save final test classification scores as csv
test.to_csv(file_directory / "testing_classification_results.csv")

# (7) Accuracy Score

In [58]:
accuracy = accuracy(test)
print(accuracy)

0.37948717948717947
