In [1]:
#import/install all packages at the top

import pandas as pd
from sklearn.model_selection import train_test_split
import spacy
nlp = spacy.load("en_core_web_sm")
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
from spacy.lang.en.stop_words import STOP_WORDS
import string

# Sections:

# (1) Import dataset and split

# (2) Functions

# (3) Validation Testing (obtain stats ready for comparison)
------- get stats and create summary df
- (3.1) Average Word Count
- (3.2) Average Sentence Count
- (3.3) Punctuation Richness
- (3.4) Sarcasm Symbol
- (3.5) Upper-case Words

- (3.6) Individual Punctuation Count
- (3.7) Word Type Count
- (3.8) Named Entity Count

# (4) Additional Functions for Classification
# (5) Classification
# (5) Classification Results
# (6) Accuracy Score

# (1) Import and Split

In [2]:
#Import and Read file as DF with PANDAS (for better visualisation)
filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/irony-labeled.csv"
gold_label = pd.read_csv(filename)

In [3]:
#Rename the columns
gold_label.columns = ["Comment_Text", "Label"]

In [4]:
#Split to get two DFs (prep for split)

y = gold_label["Comment_Text"]
x = gold_label["Label"]

In [5]:
#Split the dataset into TEST and TRAIN sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=43)

#Split the TRAIN set again to get VALIDATION set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=43)

In [6]:
#JOIN the series together to get final splits as DFs
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)
val = pd.concat([X_val, y_val], axis=1)

In [7]:
#Check number of comments labelled as ironic vs non-ironic
ironic_val = val[val["Label"] == 1]
nonironic_val = val[val["Label"] == -1]

print(f"Training data contains {len(ironic_val)} IRONIC comments")
print(f"Training data contains {len(nonironic_val)} NON- IRONIC comments")

Training data contains 47 IRONIC comments
Training data contains 148 NON- IRONIC comments


In [8]:
#Convert TEST(validation) set into a dictionary
val_dict = val.set_index(val.index).T.to_dict()

print(len(val_dict))

195


# (2) FUNCTIONS - to be saved to python script as module

In [9]:
def get_all_tokens(test_dict):
    """Input dictionary and return list of comments as SpaCy docs"""
    comment_list = []
    for comment_index, label in test_dict.items():
        for key in label:
            text = label[key]
            if type(text) == str:
                comment_list.append(nlp(text))
    return comment_list

In [10]:
def get_words(listx):
    """Take a list (already parsed through SpaCy) remove punctuation and return list of word tokens"""
    ir_clean_docs = [] #remove punctuation

    for x in listx:
        clean_list = []
        for y in x:
            if y.pos_ != 'PUNCT':
                clean_list.append(y)
        ir_clean_docs.append(clean_list)
    return ir_clean_docs

In [11]:
def get_punct(listx):
    """Take a list (already parsed through spacy), remove words and return list of punctuation ONLY"""
    ir_punct = [] #only punctuation

    for x in listx:
        clean_list = []
        for y in x:
            if y.pos_ == 'PUNCT':
                clean_list.append(y)
        ir_punct.append(clean_list)
    return ir_punct

In [12]:
def average_word_length(doc):
    """Take doc and return average word length"""
    for token in doc:
        word = token.text
        average_word_length = sum(len(word) for word in doc) / len(doc)
    return(average_word_length)

In [13]:
def average_sent_length(doc):
    """Take doc and return average sentence length"""
    sent_list = []

    for sent in doc.sents:
        len_sent = len(sent)
        sent_list.append(len_sent)

    total = sum(sent_list)
    leng = len(sent_list)

    average_sent_length = total / leng
    return(average_sent_length)

In [14]:
def relative_count_wordtypes(doc):
    """Return relative count average for all word types i.e. nouns, pronouns, verbs etc with word type as key and average as value"""
    pos_tags = []
    for token in doc:
        pos_tags.append(token.pos_)
    counting = Counter(pos_tags) #returns dictionary with whole count for each word type in doc
    
    leng = len(doc) #overall length of doc (no. of tokens)
    new_dict = {}
    
    for key, value in counting.items(): #iterate over entire dict
        new_dict[key] = value/ leng
            
            
    return new_dict

In [15]:
def check_sarcsymbol(doc):
    """Take a list of comments (parsed through SpaCy); return list with average number of "/s" symbols per comment [Reddit "/s" = sarcasm]"""
    sarcsymb = []
    leng = len(doc) 
    h = 1
    
    for x in doc: 
        if x.text == "/s" or x.text == "/sarcasm" or x.text == "/sarc":
            sarcsymb.append(h/leng) 
        else:
            pass
               
    return sarcsymb

In [16]:
def get_punct_average(punctuation_list, token_comment_list):
    """Take preprocessed list of punctuation and full token list (MUST be of equal length); 
    Returns list of the average for ALL punctuation (based on number overall of tokens)
    for each comment""" 

    punct_count = []
    for comment in punctuation_list:
        punct_count.append(len(comment))

    len_comment = []
    for comment in token_comment_list:
        len_comment.append(len(comment))
    
    punct_count, len_comment = np.array(punct_count), np.array(len_comment) 
    averages = punct_count + len_comment/2
    return averages

In [17]:
def get_indiv_punct(doc):
    """Return relative count average for all word types i.e. nouns, pronouns, verbs etc with word type as key and average as value"""
    punc_tags = []
    for token in doc:
        if token.is_punct:
            punc_tags.append(token)
            
    
    #make each a string so not multiple keys with same vaues
    punc_tags = [str(punc) for punc in punc_tags]
           

    punc_tag_dict = Counter(punc_tags) #returns dictionary with whole count for each word type in doc
    
    leng = len(doc) #overall length of doc (no. of tokens)
    new_dict = {}
    
    for key, value in punc_tag_dict.items(): #iterate over entire dict
        new_dict[key] = value/ leng
            
    final_dict = dict(new_dict)
            
    return final_dict

In [18]:
def count_uppercase(doc):
    """Take nlp doc and return the average number of fully uppercase words for each comment as a list"""
    new_list = []
    
    leng = len(doc)
    for token in doc:
        if token.is_upper == True:
            new_list.append(token)
            
    counting = Counter(new_list)
    my_dict = dict(counting)
    
    upper_count_avg = []

    x = sum(my_dict.values())
    upper_count_avg.append(x/leng)
    
    return upper_count_avg

In [19]:
def get_entities(doc):
    """Take nlp doc and return a dictionary with key as ent.labe_ and value as the average number"""
    entity = []
    for token in doc.ents:
        entity.append(token.label_)

    new_dict = Counter(entity)
    leng = len(doc)
    
    for key, value in new_dict.items():
        new_dict[key] = value / leng
        
    ent_dict = dict(new_dict)
    
    return ent_dict



# (3) Validation Tetsing
- Obtain all statistics (based on training) ready for comparison in next section

In [20]:
#1) GET ALL TOKENS
tokens = get_all_tokens(val_dict)

In [21]:
#2) Get list of ONLY words (no punct)
word_list = get_words(tokens)

In [22]:
#3) Get list of ONLY punct (no words)
punct_list = get_punct(tokens)

In [23]:
#4) WORD LENGTH
average_word_list = []
for comment in word_list:
    average_word_list.append(average_word_length(comment))

print(len(average_word_list))    
    
#Create DataFrame for Summary of Irony STATS
summary= pd.DataFrame({"Average Word Length": average_word_list})

195


In [24]:
#Create df for total, full returns for irony
total_val= pd.DataFrame({'Comment Parsed':tokens})
total_val["Tokens"] = word_list
total_val["Punctuation"] = punct_list
total_val.head()

Unnamed: 0,Comment Parsed,Tokens,Punctuation
0,"(Cola, costs, more, per, litre, than, petrol, ...","[Cola, costs, more, per, litre, than, petrol, ...",[.]
1,"(Damn, ., , I, ca, n't, believe, this, book,...","[Damn, , I, ca, n't, believe, this, book, se...","[., ., ,, .]"
2,"(My, mother, -, in, -, law, watches, way, to, ...","[My, mother, in, law, watches, way, to, much, ...","[-, -, ,, ,, ,, ., ,, ., ,, "", ,, ,, !, !, "", .]"
3,"(Do, you, know, what, type, of, salt, was, use...","[Do, you, know, what, type, of, salt, was, use...",[?]
4,"(Also, ,, I, 'm, pretty, sure, surveys, have, ...","[Also, I, 'm, pretty, sure, surveys, have, sho...","[,, ., ,, .]"


In [25]:
#SENTENCE LENGTH
average_sentence_list = []
for x in tokens:
    average_sentence_list.append(average_sent_length(x))

#Add to Summary of Irony STATS df
summary["Average Sentence Length"] = average_sentence_list
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length
0,4.555556,10.0
1,4.08,9.666667
2,3.625,27.2
3,3.666667,13.0
4,4.704545,24.0


In [26]:
#/S SYMBOLS

sarcfunc = []
for x in tokens:
    sarcfunc.append(check_sarcsymbol(x))


sarcsymb_list = []        
for l in sarcfunc:
    if len(l) >= 1:
        sarcsymb_list.append(l)
    else:
        sarcsymb_list.append([0])

#Remove list layer 
sarcsymb_list = list(chain.from_iterable(sarcsymb_list))



summary["sarcsymb"] = sarcsymb_list

In [27]:
#PUNCTUATION RICHNESS
average_punct_list = get_punct_average(punct_list, tokens)

summary["Punctuation Richness"] = average_punct_list
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length,sarcsymb,Punctuation Richness
0,4.555556,10.0,0.0,6.0
1,4.08,9.666667,0.0,18.5
2,3.625,27.2,0.0,84.0
3,3.666667,13.0,0.0,7.5
4,4.704545,24.0,0.0,28.0


In [28]:
#WORD TYPE AVERAGE 

average_wordtype_list = []
for comment in tokens:
    average_wordtype_list.append(relative_count_wordtypes(comment))

summary_wordtypedf = pd.DataFrame(average_wordtype_list)
summary_wordtypedf.head()

Unnamed: 0,ADJ,ADP,ADV,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SPACE,SYM,VERB,X
0,0.1,0.2,0.2,,,,0.2,,,,0.1,0.1,,,0.1,
1,,0.068966,0.068966,0.034483,0.068966,0.034483,0.172414,,,0.103448,,0.137931,0.068966,,0.241379,
2,0.073529,0.088235,0.029412,0.036765,0.088235,,0.161765,0.014706,0.022059,0.088235,0.014706,0.117647,0.022059,0.007353,0.227941,0.007353
3,0.076923,0.153846,,,,,0.307692,,,0.076923,,0.076923,,,0.307692,
4,0.0625,0.125,0.083333,0.041667,0.020833,,0.1875,,0.041667,0.0625,,0.083333,0.020833,,0.270833,


In [29]:
#INDIVIDUAL PUNCTUATION AVERAGE

average_indiv_punc_list = []
for x in tokens:
    average_indiv_punc_list.append(get_indiv_punct(x))


summary_indiv_punct = pd.DataFrame(average_indiv_punc_list)
summary_indiv_punct.head()

Unnamed: 0,!,"""",#,%,&,',(,),*,",",...,:,:),;,?,[,],_,–,“,”
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,0.034483,...,,,,,,,,,,
2,0.014706,0.014706,,,,,,,,0.051471,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,0.076923,,,,,,
4,,,,,,,,,,0.041667,...,,,,,,,,,,


In [30]:
#UPPER CASE WORDS (total)

uppercase_list = []
for b in tokens:
    uppercase_list.append((count_uppercase(b)))
    
#Remove list layer 
uppercase_list = list(chain.from_iterable(uppercase_list))

summary["Uppercase Average"] = uppercase_list
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length,sarcsymb,Punctuation Richness,Uppercase Average
0,4.555556,10.0,0.0,6.0,0.0
1,4.08,9.666667,0.0,18.5,0.034483
2,3.625,27.2,0.0,84.0,0.051471
3,3.666667,13.0,0.0,7.5,0.0
4,4.704545,24.0,0.0,28.0,0.020833


In [31]:
#GET IR ENTITIES 
named_entity_list = []
for comment in tokens:
    named_entity_list.append(get_entities(comment))
    
summary_named_entity = pd.DataFrame(named_entity_list)

summary_named_entity = summary_named_entity.replace(np.nan, 0) 
summary_named_entity.head()

Unnamed: 0,CARDINAL,DATE,EVENT,FAC,GPE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.007353,0.014706,0.0,0.0,0.0,0.0,0.0,0.007353,0.0,0.0,0.007353,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020833,0.0,0.0,0.0,0.0,0.0,0.0


# (4) Additional Functions (Classification)

In [32]:
def get_classification_ironic(masterdf, newdf, mastercolumnindex_number, newcolumnindexnumber, weight):
    """Compares two columns of two dataframes with row index as 0, 
    based on indices inputted; calculates the difference between the two values and multiples by weight assigned.
    Returns list with new feature values"""
    
    ironic_average = masterdf.iloc[0][mastercolumnindex_number]
      
    #access column ONLY and all rows
    x = list(newdf.iloc[:,newcolumnindexnumber])

    new_list = []
    
    for item in x:
        new_list.append(abs(ironic_average - item)*weight)

        
    return new_list

In [33]:
def get_classification_non_ironic(masterdf, newdf, mastercolumnindex_number, newcolumnindexnumber, weight):
    """Compares two columns of two dataframes with row index as 1,
    based on indices inputted; calculates the difference between the two values 
    and multiples by the weight assigned.
    Returns list with new feature values"""
    
    non_ironic_avergae = masterdf.iloc[1][mastercolumnindex_number]
      
    #access column ONLY and all rows
    x = list(newdf.iloc[:,newcolumnindexnumber])

    new_list = []
    
    for item in x:
        new_list.append(abs(non_ironic_avergae - item)*weight)

        
    return new_list

In [34]:
def final_predicition_results(feature_resultdf):
    """Take feature dataframe and return list with final classification label"""
    
    list_of_tuple_results = [tuple(x) for x in feature_resultdf.to_records(index=False)]
    
    prediciton_list = []
    
    for tup in list_of_tuple_results:
        non_ironic, ironic = tup
    
        if non_ironic > ironic:
            prediciton_list.append("1") #ironic
                
        elif non_ironic < ironic:
            prediciton_list.append("-1") #non-ironic
    
    return prediciton_list 

In [36]:
def accuracy(testdf):
    """Compares labelled data with prediction and calculates accuracy of classification"""
    
    label = list(testdf.iloc[:,0])
    prediciton = list(testdf.iloc[:,2])
    list_of_tuple_evaluations = list(zip(label, prediciton))
    
    gold_match = []
    no_match = []
    
    for tup in list_of_tuple_evaluations:
        label, prediction = tup
        
        if label == int(prediction):
            gold_match.append("True") #gold label match
                
        elif label != int(prediction):
            no_match.append("False")
            
    total_leng = len(testdf)
    gold_leng = len(gold_match)
    
    accuracy = gold_leng / total_leng  
    
    return accuracy
    

# (5) Classification

Steps:
- (1) Import Master DF ###(1) GENERAL
- (2) Get Results for each comparison using classification function (1) Ir, (2) Non-ir
- (3) Create PredictorDF for (1) Ironic, (2) Non-ironic

''''''''''''''''''''''''''''''
- (4) Import Master DF ###(2) POS
- (5) Get Results for each comparison using classification function (1) Ir, (2) Non-ir
- (6) Add to each PredictorDF for (1) Ironic, (2) Non-ironic

''''''''''''''''''''''''''''''
- (7) Import Master DF ###(3) NAMED ENTITY
- Repeat steps 5 & 6

''''''''''''''''''''''''''''''
- (8) Import Master DF ###(4) PUNCTUATION
- Repeat steps 5 & 6 

In [37]:
####(1)
#import GENERAL summary table
master_filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/train_summary_general.csv"
mastergeneral_df = pd.read_csv(master_filename)
mastergeneral_df.head()

mastergeneral_df = mastergeneral_df.rename(columns={mastergeneral_df.columns[0]: "Class"})

In [38]:
#get results from classification function for IRONIC
ironic_average_word_length = get_classification_ironic(mastergeneral_df, summary, 1, 0, 0.8)
ironic_average_sent_length= get_classification_ironic(mastergeneral_df, summary, 2, 1, 1)
ironic_average_sarcsymb= get_classification_ironic(mastergeneral_df, summary, 3, 2, 7)
ironic_average_uppercase = get_classification_ironic(mastergeneral_df, summary, 4, 4, 1)
ironic_punct_richness = get_classification_ironic(mastergeneral_df, summary, 5, 3, 2)

In [39]:
#IRONIC
#Create PREDICTOR DATAFRAME with classifications (all features)

ironic_predictor_df = pd.DataFrame(ironic_average_word_length)
ironic_predictor_df.columns = ['WORD LENGTH'] + ironic_predictor_df.columns.tolist()[1:]

ironic_predictor_df["SENTENCE LENGTH"] = ironic_average_sent_length
ironic_predictor_df["PUNCT RICH"] = ironic_punct_richness
ironic_predictor_df["SARC SYMB /S"] = ironic_average_sarcsymb
ironic_predictor_df["UPPERCASE"] = ironic_average_uppercase

ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,PUNCT RICH,SARC SYMB /S,UPPERCASE
0,0.201098,4.261968,29.855297,0.005481,0.02535
1,0.179346,4.595301,4.855297,0.005481,0.009132
2,0.543346,12.938032,126.144703,0.005481,0.02612
3,0.510013,1.261968,26.855297,0.005481,0.02535
4,0.32029,9.738032,14.144703,0.005481,0.004517


In [40]:
#get results from classification function for NON-IRONIC
non_ironic_average_word_length = get_classification_non_ironic(mastergeneral_df, summary, 1, 0, 0.8)
non_ironic_average_sent_length= get_classification_non_ironic(mastergeneral_df, summary, 2, 1, 1)
non_ironic_average_sarcsymb= get_classification_non_ironic(mastergeneral_df, summary, 3, 2, 7)
non_ironic_average_uppercase = get_classification_non_ironic(mastergeneral_df, summary, 4, 4, 1)
non_ironic_punct_richness = get_classification_non_ironic(mastergeneral_df, summary, 5, 3, 2)

In [41]:
#NON-IRONIC
#Create PREDICTOR DATAFRAME with classifications (all features)

non_ironic_predictor_df = pd.DataFrame(non_ironic_average_word_length)
non_ironic_predictor_df.columns = ['WORD LENGTH'] + non_ironic_predictor_df.columns.tolist()[1:]

non_ironic_predictor_df["SENTENCE LENGTH"] = non_ironic_average_sent_length
non_ironic_predictor_df["SARC SYMB /S"] = non_ironic_average_sarcsymb
non_ironic_predictor_df["PUNCT RICH"] = non_ironic_punct_richness
non_ironic_predictor_df["UPPERCASE"] = non_ironic_average_uppercase
non_ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,SARC SYMB /S,PUNCT RICH,UPPERCASE
0,0.115636,5.805244,0.0,60.301945,0.02346
1,0.264808,6.138577,0.0,35.301945,0.011023
2,0.628808,11.394756,0.0,95.698055,0.028011
3,0.595475,2.805244,0.0,57.301945,0.02346
4,0.234828,8.194756,0.0,16.301945,0.002626


In [42]:
####(2)
#import POS summary table
master_wordtype_filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/train_summary_wordtype.csv"
masterwordtype_df = pd.read_csv(master_wordtype_filename)
masterwordtype_df.head()

masterwordtype_df = masterwordtype_df.rename(columns={mastergeneral_df.columns[0]: "Class"}) 

In [43]:
#get results from classification function for IRONIC
#E.g. PRON, PROPN, NOUN

ironic_PRON_dist= get_classification_ironic(masterwordtype_df, summary_wordtypedf, 10, 9, 1)
ironic_PROPN_dist_length= get_classification_ironic(masterwordtype_df, summary_wordtypedf, 11, 10, 1)
ironic_NOUN_dist = get_classification_ironic(masterwordtype_df, summary_wordtypedf, 7, 6, 1)

In [44]:
#Add to IRONIC PREDICTOR DATAFRAME
ironic_predictor_df["PRON"] = ironic_PRON_dist
ironic_predictor_df["PROPN"] = ironic_PROPN_dist_length
ironic_predictor_df["NOUN"] = ironic_NOUN_dist
ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,PUNCT RICH,SARC SYMB /S,UPPERCASE,PRON,PROPN,NOUN
0,0.201098,4.261968,29.855297,0.005481,0.02535,,0.002975,0.022829
1,0.179346,4.595301,4.855297,0.005481,0.009132,0.012317,,0.004757
2,0.543346,12.938032,126.144703,0.005481,0.02612,0.002896,0.088269,0.015406
3,0.510013,1.261968,26.855297,0.005481,0.02535,0.014209,,0.130521
4,0.32029,9.738032,14.144703,0.005481,0.004517,0.028632,,0.010329


In [45]:
#get results from classification function for NON-IRONIC
#E.g. PRON, PROPN, NOUN

nonironic_PRON_dist= get_classification_non_ironic(masterwordtype_df, summary_wordtypedf, 10, 9, 1)
nonironic_PROPN_dist_length= get_classification_non_ironic(masterwordtype_df, summary_wordtypedf, 11, 10, 1)
nonironic_NOUN_dist = get_classification_non_ironic(masterwordtype_df, summary_wordtypedf, 7, 6, 1)

In [46]:
#Add to NON-IRONIC PREDICTOR DATAFRAME
non_ironic_predictor_df["PRON"] = nonironic_PRON_dist
non_ironic_predictor_df["PROPN"] = nonironic_PROPN_dist_length
non_ironic_predictor_df["NOUN"] = nonironic_NOUN_dist
non_ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,SARC SYMB /S,PUNCT RICH,UPPERCASE,PRON,PROPN,NOUN
0,0.115636,5.805244,0.0,60.301945,0.02346,,0.024394,0.028771
1,0.264808,6.138577,0.0,35.301945,0.011023,0.024472,,0.001184
2,0.628808,11.394756,0.0,95.698055,0.028011,0.009259,0.0609,0.009465
3,0.595475,2.805244,0.0,57.301945,0.02346,0.002053,,0.136463
4,0.234828,8.194756,0.0,16.301945,0.002626,0.016476,,0.016271


In [47]:
####(3)
#import NER summary table
master_ner_filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/train_summary_namedentity.csv"
masterentity_df = pd.read_csv(master_ner_filename)
masterentity_df.head()

masterentity_df.rename(columns={mastergeneral_df.columns[0]: "Class"})
masterentity_df = masterentity_df.replace(np.nan, 0)
masterentity_df

Unnamed: 0.1,Unnamed: 0,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART
0,Ironic,0.041123,0.045283,0.025641,0.003165,0.048072,0.0,0.015036,0.039665,0.036168,0.044726,0.033967,0.055027,0.045914,0.047837,0.01114,0.003165,0.03235,0.027053
1,Non-ironic,0.034334,0.023964,0.009453,0.012705,0.029929,0.021429,0.02059,0.019512,0.025125,0.034095,0.017485,0.032427,0.025188,0.03766,0.009576,0.021969,0.01667,0.018885


In [48]:
#get results from classification function for IRONIC
#E.g. PERSON, LOC, GPE, LANGUAGE (none)

ironic_PERSON_dist= get_classification_ironic(masterentity_df, summary_named_entity, 14, 13, 1)
ironic_LOC_dist_length= get_classification_ironic(masterentity_df, summary_named_entity, 8, 7, 1)
ironic_GPE_dist = get_classification_ironic(masterentity_df, summary_named_entity, 5, 4, 1)
ironic_LANGUAGE_dist = get_classification_ironic(masterentity_df, summary_named_entity, 6, 5, 1)

In [49]:
#Add to IRONIC PREDICTOR DATAFRAME
ironic_predictor_df["PERSON"] = ironic_PERSON_dist
ironic_predictor_df["LOC"] = ironic_LOC_dist_length
ironic_predictor_df["GPE"] = ironic_GPE_dist
ironic_predictor_df["LANGUAGE"] = ironic_LANGUAGE_dist
ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,PUNCT RICH,SARC SYMB /S,UPPERCASE,PRON,PROPN,NOUN,PERSON,LOC,GPE,LANGUAGE
0,0.201098,4.261968,29.855297,0.005481,0.02535,,0.002975,0.022829,0.047837,0.039665,0.048072,0.0
1,0.179346,4.595301,4.855297,0.005481,0.009132,0.012317,,0.004757,0.047837,0.039665,0.048072,0.0
2,0.543346,12.938032,126.144703,0.005481,0.02612,0.002896,0.088269,0.015406,0.047837,0.032312,0.048072,0.0
3,0.510013,1.261968,26.855297,0.005481,0.02535,0.014209,,0.130521,0.047837,0.039665,0.048072,0.0
4,0.32029,9.738032,14.144703,0.005481,0.004517,0.028632,,0.010329,0.047837,0.039665,0.048072,0.0


In [50]:
#get results from classification function for IRONIC
#E.g. PERSON, LOC, GPE, LANGUAGE (none)

nonironic_PERSON_dist= get_classification_non_ironic(masterentity_df, summary_named_entity, 14, 13, 1)
nonironic_LOC_dist= get_classification_non_ironic(masterentity_df, summary_named_entity, 8, 7, 1)
nonironic_GPE_dist = get_classification_non_ironic(masterentity_df, summary_named_entity, 5, 4, 1)
nonironic_LANGUAGE_dist = get_classification_non_ironic(masterentity_df, summary_named_entity, 6, 5, 1)

In [51]:
#Add to NON-IRONIC PREDICTOR DATAFRAME
non_ironic_predictor_df["PERSON"] = nonironic_PERSON_dist
non_ironic_predictor_df["LOC"] = nonironic_LOC_dist
non_ironic_predictor_df["GPE"] = nonironic_GPE_dist
non_ironic_predictor_df["LANGUAGE"] = nonironic_LANGUAGE_dist
non_ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,SARC SYMB /S,PUNCT RICH,UPPERCASE,PRON,PROPN,NOUN,PERSON,LOC,GPE,LANGUAGE
0,0.115636,5.805244,0.0,60.301945,0.02346,,0.024394,0.028771,0.03766,0.019512,0.029929,0.021429
1,0.264808,6.138577,0.0,35.301945,0.011023,0.024472,,0.001184,0.03766,0.019512,0.029929,0.021429
2,0.628808,11.394756,0.0,95.698055,0.028011,0.009259,0.0609,0.009465,0.03766,0.012159,0.029929,0.021429
3,0.595475,2.805244,0.0,57.301945,0.02346,0.002053,,0.136463,0.03766,0.019512,0.029929,0.021429
4,0.234828,8.194756,0.0,16.301945,0.002626,0.016476,,0.016271,0.03766,0.019512,0.029929,0.021429


In [52]:
####(4)
#import PUNCTUATION summary table
master_punct_filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/train_summary_puncttype.csv"
masterpunct_df = pd.read_csv(master_punct_filename)
masterpunct_df.head()

masterpunct_df = masterpunct_df.rename(columns={mastergeneral_df.columns[0]: "Class"})
pd.options.display.max_columns = 40
masterpunct_df

Unnamed: 0.1,Unnamed: 0,!,"""",""")",#,%,&,',(,),*,",",-,--,---,----------,.,..,...,....,.....,......,.......,/,:,:(,:),:-),;,?,[,\,],_,§,–,—,‘,“,”
0,Ironic,0.093206,0.063918,0.003165,0.0,0.047312,0.020822,0.070586,0.017297,0.018151,0.092469,0.059577,0.048803,0.023541,0.0,0.0,0.076684,0.06015,0.050782,0.036945,0.0,0.0,0.0,0.050977,0.037716,0.04,0.0625,0.0,0.023393,0.067477,0.023683,0.032258,0.032581,0.018987,0.0,0.003165,0.029412,0.003165,0.018484,0.020308
1,Non-ironic,0.084272,0.05079,0.0,0.027027,0.022251,0.020579,0.053885,0.018097,0.021424,0.055709,0.044991,0.040669,0.018135,0.009804,0.006079,0.070459,0.053516,0.040926,0.060428,0.092655,0.025933,0.009434,0.04434,0.014209,0.032215,0.075321,0.012393,0.021373,0.049411,0.076399,0.0,0.122762,0.011111,0.011765,0.011765,0.009143,0.008478,0.011914,0.01263


In [53]:
#get results from classification function for IRONIC
#E.g. !, ', *, :( 

ironic_exclam_dist= get_classification_ironic(masterentity_df, summary_indiv_punct, 1, 0, 1)
ironic_apost_dist_length= get_classification_ironic(masterentity_df, summary_indiv_punct, 7, 6, 1)
ironic_star_dist = get_classification_ironic(masterentity_df, summary_indiv_punct, 10, 9, 1)
# ironic_sademoji_dist = get_classification_ironic(masterentity_df, summary_indiv_punct, 25, 24, 1)

In [54]:
#Add to IRONIC PREDICTOR DATAFRAME
ironic_predictor_df["!"] = ironic_exclam_dist
ironic_predictor_df["'"] = ironic_apost_dist_length
ironic_predictor_df["*"] = ironic_star_dist
# ironic_predictor_df["LANGUAGE"] = ironic_LANGUAGE_dist
ironic_predictor_df.head()



Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,PUNCT RICH,SARC SYMB /S,UPPERCASE,PRON,PROPN,NOUN,PERSON,LOC,GPE,LANGUAGE,!,',*
0,0.201098,4.261968,29.855297,0.005481,0.02535,,0.002975,0.022829,0.047837,0.039665,0.048072,0.0,,,
1,0.179346,4.595301,4.855297,0.005481,0.009132,0.012317,,0.004757,0.047837,0.039665,0.048072,0.0,,,0.010243
2,0.543346,12.938032,126.144703,0.005481,0.02612,0.002896,0.088269,0.015406,0.047837,0.032312,0.048072,0.0,0.026417,,0.006744
3,0.510013,1.261968,26.855297,0.005481,0.02535,0.014209,,0.130521,0.047837,0.039665,0.048072,0.0,,,
4,0.32029,9.738032,14.144703,0.005481,0.004517,0.028632,,0.010329,0.047837,0.039665,0.048072,0.0,,,0.00306


In [55]:
#get results from classification function for NON- IRONIC
#E.g. !, ', *, :( 

nonironic_exclam_dist= get_classification_non_ironic(masterentity_df, summary_indiv_punct, 1, 0, 1)
nonironic_apost_dist_length= get_classification_non_ironic(masterentity_df, summary_indiv_punct, 7, 6, 1)
nonironic_star_dist = get_classification_non_ironic(masterentity_df, summary_indiv_punct, 10, 9, 1)
# ironic_sademoji_dist = get_classification_ironic(masterentity_df, summary_indiv_punct, 25, 24, 1)

In [56]:
#Add to IRONIC PREDICTOR DATAFRAME
non_ironic_predictor_df["!"] = nonironic_exclam_dist
non_ironic_predictor_df["'"] = nonironic_apost_dist_length
non_ironic_predictor_df["*"] = nonironic_star_dist
# ironic_predictor_df["LANGUAGE"] = ironic_LANGUAGE_dist
non_ironic_predictor_df.head()



Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,SARC SYMB /S,PUNCT RICH,UPPERCASE,PRON,PROPN,NOUN,PERSON,LOC,GPE,LANGUAGE,!,',*
0,0.115636,5.805244,0.0,60.301945,0.02346,,0.024394,0.028771,0.03766,0.019512,0.029929,0.021429,,,
1,0.264808,6.138577,0.0,35.301945,0.011023,0.024472,,0.001184,0.03766,0.019512,0.029929,0.021429,,,0.000388
2,0.628808,11.394756,0.0,95.698055,0.028011,0.009259,0.0609,0.009465,0.03766,0.012159,0.029929,0.021429,0.019629,,0.017376
3,0.595475,2.805244,0.0,57.301945,0.02346,0.002053,,0.136463,0.03766,0.019512,0.029929,0.021429,,,
4,0.234828,8.194756,0.0,16.301945,0.002626,0.016476,,0.016271,0.03766,0.019512,0.029929,0.021429,,,0.007572


# (6) Classification Results

In [57]:
#calculate the sum of all features for each comment
ironic_feature_prediction = ironic_predictor_df.sum(axis=1)

#add final column to ironic predictor df with feature totals
ironic_predictor_df["Feature Weight"] = ironic_predictor_df.sum(axis=1)

In [58]:
#calculate the sum of all features for each comment
non_ironic_feature_prediction = non_ironic_predictor_df.sum(axis=1)

#add final column to ironic predictor df with feature totals
non_ironic_predictor_df["Feature Weight"] = non_ironic_predictor_df.sum(axis=1)

In [59]:
#create final df with final predicitons
final_predictordf = pd.DataFrame(non_ironic_feature_prediction)

final_predictordf.columns = ["Non-ironic Feature Result"] + final_predictordf.columns.tolist()[1:]
final_predictordf["Ironic Feature Result"] = ironic_feature_prediction

final_predictordf.head()

Unnamed: 0,Non-ironic Feature Result,Ironic Feature Result
0,66.407978,34.510572
1,41.850927,9.807448
2,107.967435,139.925636
3,60.973168,28.938413
4,24.883003,24.390617


In [60]:
final_prediction = final_predicition_results(final_predictordf)
val["Prediction"] = final_prediction
print(len(val))

195


# (7) Accuracy Score

In [61]:
accuracy = accuracy(val)
print(accuracy)

0.5025641025641026
