In [1]:
#import/install all packages at the top

import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk 
from collections import Counter
from spacy.lang.en.stop_words import STOP_WORDS
import string

In [2]:
#import and read file in df with pandas (for better visualisation)

filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/irony-labeled.csv"

gold_label = pd.read_csv(filename)

In [3]:
#rename the columns
gold_label.columns = ["Comment_Text", "Label"]

# Split into Train (70), Validation (10) and Test (20) sets

-using scikit learn 'train_test_split' function twices gives the validation set

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
y = gold_label["Comment_Text"]
x = gold_label["Label"]


In [6]:
#test and train sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=43)

#split the training set to get validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=43)

In [7]:
#pandas concat joins series together (i.e. dataframes)
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)
val = pd.concat([X_val, y_val], axis=1)

# VALIDATION TESTING WIOOOO


In [8]:
#check format of train df

print(val)

      Label                                       Comment_Text
1836     -1  Cola costs more per litre than petrol around h...
1704     -1  Damn.   I can't believe this book sells.   It'...
1656     -1  My mother-in-law watches way to much cable TV,...
792      -1  Do you know what type of salt was used for hea...
1286     -1  Also, I'm pretty sure surveys have shown that ...
1930     -1  I thought this was even more impressive:\n\n&g...
1639     -1  I think the article made it pretty clear that ...
286      -1  Libertarians are the mirror image of communist...
692      -1             She's a reformist, not a revolutionary
538      -1                    It should have absolutely none.
1648      1                        Can't argue with that logic
140      -1  She would have much more effect as the chairma...
1796     -1  If you think your legislator is too conservati...
860      -1  I love this clip sooo much, because she just n...
995       1                               who?  the dem

In [9]:
##just so we can get our numbers ie how many of each
ironic_val = val[val["Label"] == 1]
nonironic_val = val[val["Label"] == -1]

print(f"Training data contains {len(ironic_val)} IRONIC comments")
print(f"Training data contains {len(nonironic_val)} NON- IRONIC comments")

Training data contains 47 IRONIC comments
Training data contains 148 NON- IRONIC comments


In [10]:
# #Convert into 2 dictionaries
val_dict = val.set_index(val.index).T.to_dict()

print(len(val_dict))
print(len(val_dict))

195
195


# FUNCTIONS

In [11]:
def get_all_tokens(test_dict):
    """Take dictionary and return list of comments as spacy docs"""
    comment_list = []
    for comment_index, label in test_dict.items():
        for key in label:
            text = label[key]
            if type(text) == str:
                comment_list.append(nlp(text))
    return comment_list

In [12]:
def get_words(listx):
    """Take a list (already parsed through SpaCy) remove punctuation and return list of word tokens"""
    ir_clean_docs = [] #remove punctuation

    for x in listx:
        clean_list = []
        for y in x:
            if y.pos_ != 'PUNCT':
                clean_list.append(y)
        ir_clean_docs.append(clean_list)
    return ir_clean_docs

In [13]:
def get_punct(listx):
    """Take a list (already parsed through spacy), remove words and return list of punctuation ONLY"""
    ir_punct = [] #only punctuation

    for x in listx:
        clean_list = []
        for y in x:
            if y.pos_ == 'PUNCT':
                clean_list.append(y)
        ir_punct.append(clean_list)
    return ir_punct

In [14]:
def average_word_length(doc):
    """Take doc and return average word length"""
    for token in doc:
        word = token.text
        average_word_length = sum(len(word) for word in doc) / len(doc)
    return(average_word_length)

In [15]:
def average_sent_length(doc):
    """Take doc and return average sentence length"""
    sent_list = []

    for sent in doc.sents:
        len_sent = len(sent)
        sent_list.append(len_sent)

    total = sum(sent_list)
    leng = len(sent_list)

    average_sent_length = total / leng
    return(average_sent_length)

In [16]:
def relative_count_wordtypes(doc):
    """Return relative count average for all word types i.e. nouns, pronouns, verbs etc with word type as key and average as value"""
    pos_tags = []
    for token in doc:
        pos_tags.append(token.pos_)
    counting = Counter(pos_tags) #returns dictionary with whole count for each word type in doc
    
    leng = len(doc) #overall length of doc (no. of tokens)
    new_dict = {}
    
    for key, value in counting.items(): #iterate over entire dict
        new_dict[key] = value/ leng
            
            
    return new_dict

In [17]:
def check_sarcsymbol(comment_list):
    """Take a list of comments (parsed through SpaCy); return list of items if "/s" is present [Reddit "/s" = sarcasm]"""
    sarcsymb = []
    for x in comment_list:
        for y in x:
            if y.text == "/s":
                sarcsymb.append(x)
    return(sarcsymb)


In [18]:
def get_punct_average(punctuation_list, token_comment_list):
    """Take preprocessed list of punctuation and full token list (MUST be of equal length); 
    Returns list of the average for ALL punctuation (based on number overall of tokens)
    for each comment""" 

    punct_count = []
    for comment in punctuation_list:
        punct_count.append(len(comment))

    len_comment = []
    for comment in token_comment_list:
        len_comment.append(len(comment))
    
    punct_count, len_comment = np.array(punct_count), np.array(len_comment) 
    averages = punct_count + len_comment/2
    return averages

In [19]:
def get_indiv_punct(doc):
    """Return relative count average for all word types i.e. nouns, pronouns, verbs etc with word type as key and average as value"""
    punc_tags = []
    for token in doc:
        if token.is_punct:
            punc_tags.append(token)
            
    
    #make each a string so not multiple keys with same vaues
    punc_tags = [str(punc) for punc in punc_tags]
           

    punc_tag_dict = Counter(punc_tags) #returns dictionary with whole count for each word type in doc
    
    leng = len(doc) #overall length of doc (no. of tokens)
    new_dict = {}
    
    for key, value in punc_tag_dict.items(): #iterate over entire dict
        new_dict[key] = value/ leng
            
    final_dict = dict(new_dict)
            
    return final_dict

In [20]:
def count_uppercase(doc):
    """Take nlp doc and return the average number of fully uppercase words for each comment as a list"""
    listd = []
    
    for token in doc:
        if token.is_upper == True:
            listd.append(token)
            
    counting = Counter(listd)
    
    my_dict = dict(counting)
    upper_count_avg = []
    
#     for key, value in my_dict.items():
    x = sum(my_dict.values())
    upper_count_avg.append(x)
#         if key == str:
#             my_dict[key] = sum(values)
    return upper_count_avg

In [21]:
def get_entities(doc):
    """Take nlp doc and return a dictionary with key as ent.labe_ and value as the average number"""
    entity = []
    for token in doc.ents:
        entity.append(token.label_)

    new_dict = Counter(entity)
    leng = len(doc)
    
    for key, value in new_dict.items():
        new_dict[key] = value / leng
        
    ent_dict = dict(new_dict)
    
    return ent_dict



# START THE VAL TESTING

In [22]:
#1) GET ALL TOKENS
tokens = get_all_tokens(val_dict)

In [23]:
#2) Get list of ONLY words (no punct)
word_list = get_words(tokens)

In [24]:
#3) Get list of ONLY punct (no words)
punct_list = get_punct(tokens)

In [25]:
#Create df for total, full returns for irony
total_val= pd.DataFrame({'Comment Parsed':tokens})
total_val["Tokens"] = word_list
total_val["Punctuation"] = punct_list
total_val.head()

Unnamed: 0,Comment Parsed,Tokens,Punctuation
0,"(Cola, costs, more, per, litre, than, petrol, ...","[Cola, costs, more, per, litre, than, petrol, ...",[.]
1,"(Damn, ., , I, ca, n't, believe, this, book,...","[Damn, , I, ca, n't, believe, this, book, se...","[., ., ,, .]"
2,"(My, mother, -, in, -, law, watches, way, to, ...","[My, mother, in, law, watches, way, to, much, ...","[-, -, ,, ,, ,, ., ,, ., ,, "", ,, ,, !, !, "", .]"
3,"(Do, you, know, what, type, of, salt, was, use...","[Do, you, know, what, type, of, salt, was, use...",[?]
4,"(Also, ,, I, 'm, pretty, sure, surveys, have, ...","[Also, I, 'm, pretty, sure, surveys, have, sho...","[,, ., ,, .]"


In [26]:
#4) WORD LENGTH
average_word_list = []
for comment in word_list:
    average_word_list.append(average_word_length(comment))

#Create DataFrame for Summary of Irony STATS
summary= pd.DataFrame({"Average Word Length": average_word_list})

In [27]:
#SENTENCE LENGTH
average_sentence_list = []
for x in tokens:
    average_sentence_list.append(average_sent_length(x))

#Add to Summary of Irony STATS df
summary["Average Sentence Length"] = average_sentence_list
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length
0,4.555556,10.0
1,4.08,9.666667
2,3.625,27.2
3,3.666667,13.0
4,4.704545,24.0


In [28]:
#/S SYMBOLS

sarcsymb = check_sarcsymbol(tokens)
sarcsymb = (len(sarcsymb))

summary["sarcsymb"] = sarcsymb

In [29]:
#PUNCTUATION RICHNESS
average_punct_list = get_punct_average(punct_list, tokens)

summary["Punctuation Richness"] = average_punct_list
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length,sarcsymb,Punctuation Richness
0,4.555556,10.0,2,6.0
1,4.08,9.666667,2,18.5
2,3.625,27.2,2,84.0
3,3.666667,13.0,2,7.5
4,4.704545,24.0,2,28.0


In [30]:
#WORD TYPE AVERAGE 

average_wordtype_list = []
for comment in tokens:
    average_wordtype_list.append(relative_count_wordtypes(comment))

summary_wordtypedf = pd.DataFrame(average_wordtype_list)
summary_wordtypedf.head()

Unnamed: 0,ADJ,ADP,ADV,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SPACE,SYM,VERB,X
0,0.1,0.2,0.2,,,,0.2,,,,0.1,0.1,,,0.1,
1,,0.068966,0.068966,0.034483,0.068966,0.034483,0.172414,,,0.103448,,0.137931,0.068966,,0.241379,
2,0.073529,0.088235,0.029412,0.036765,0.088235,,0.161765,0.014706,0.022059,0.088235,0.014706,0.117647,0.022059,0.007353,0.227941,0.007353
3,0.076923,0.153846,,,,,0.307692,,,0.076923,,0.076923,,,0.307692,
4,0.0625,0.125,0.083333,0.041667,0.020833,,0.1875,,0.041667,0.0625,,0.083333,0.020833,,0.270833,


In [31]:
#INDIVIDUAL PUNCTUATION AVERAGE

average_indiv_punc_list = []
for x in tokens:
    average_indiv_punc_list.append(get_indiv_punct(x))

summary_indiv_punct = pd.DataFrame(average_indiv_punc_list)
summary_indiv_punct.head()

Unnamed: 0,!,"""",#,%,&,',(,),*,",",...,:,:),;,?,[,],_,–,“,”
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,0.034483,...,,,,,,,,,,
2,0.014706,0.014706,,,,,,,,0.051471,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,0.076923,,,,,,
4,,,,,,,,,,0.041667,...,,,,,,,,,,


In [32]:
#UPPER CASE WORDS (total)

uppercase_list = []
for b in tokens:
    uppercase_list.append((count_uppercase(b)))

summary["Uppercase Average"] = uppercase_list
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length,sarcsymb,Punctuation Richness,Uppercase Average
0,4.555556,10.0,2,6.0,[0]
1,4.08,9.666667,2,18.5,[1]
2,3.625,27.2,2,84.0,[7]
3,3.666667,13.0,2,7.5,[0]
4,4.704545,24.0,2,28.0,[1]


In [33]:
#GET IR ENTITIES 
named_entity_list = []
for comment in tokens:
    named_entity_list.append(get_entities(comment))
    
summary_named_entity = pd.DataFrame(named_entity_list)

# summary_named_entity.head()  

# Import master general table for comparison

In [34]:
master_filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/train_summary_general.csv"
mastergeneral_df = pd.read_csv(master_filename)
mastergeneral_df.head()

mastergeneral_df.rename(columns={mastergeneral_df.columns[0]: "Class"})

Unnamed: 0,Class,Average Word Length,Average Sentence Length,Number of '/s' symbols,Punctuation Richness,Average Number of Uppercase Words
0,Ironic Comments,4.304183,14.261968,3,18.70155,0.749354
1,Non-ironic Comments,4.41101,15.805244,0,32.517912,1.132037


In [35]:
#difference between average word master and new

In [36]:
#summary
#mastergeneral_df

def get_classification_ironic(masterdf, newdf, mastercolumnindex_number, newcolumnindexnumber, weight):
    
    ironic_word_avergae = masterdf.iloc[0][mastercolumnindex_number]
      
    #access column ONLY and all rows
    x = list(newdf.iloc[:,newcolumnindexnumber])

    new_list = []
    
    for item in x:
        new_list.append(abs(ironic_word_avergae - item)*weight)

        
    return new_list

In [37]:
#get results from classification determiner
ironic_average_word_length = get_classification_ironic(mastergeneral_df, summary, 1, 0, 1)
ironic_average_sent_length= get_classification_ironic(mastergeneral_df, summary, 2, 1, 1)
ironic_punct_richness = get_classification_ironic(mastergeneral_df, summary, 4, 3, 1)
ironic_average_uppercase = get_classification_ironic(mastergeneral_df, summary, 5, 4, 1)


In [38]:
#Create PREDICTOR DATAFRAME with classifications

ironic_predictor_df = pd.DataFrame(ironic_average_word_length)

ironic_predictor_df.columns = ['WORD LENGTH'] + ironic_predictor_df.columns.tolist()[1:]

ironic_predictor_df["SENTENCE LENGTH"] = ironic_average_sent_length
ironic_predictor_df["PUNCT RICH"] = ironic_punct_richness
ironic_predictor_df["UPPERCASE"] = ironic_average_uppercase

ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,PUNCT RICH,UPPERCASE
0,0.251373,4.261968,12.70155,[0.7493540051679587]
1,0.224183,4.595301,0.20155,[0.2506459948320413]
2,0.679183,12.938032,65.29845,[6.250645994832041]
3,0.637516,1.261968,11.20155,[0.7493540051679587]
4,0.400363,9.738032,9.29845,[0.2506459948320413]


In [39]:
ironic_feature_prediction = ironic_predictor_df.sum(axis=1)

In [40]:
ironic_predictor_df["Feature Weight"] = ironic_predictor_df.sum(axis=1)
ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,PUNCT RICH,UPPERCASE,Feature Weight
0,0.251373,4.261968,12.70155,[0.7493540051679587],17.964245
1,0.224183,4.595301,0.20155,[0.2506459948320413],5.27168
2,0.679183,12.938032,65.29845,[6.250645994832041],85.16631
3,0.637516,1.261968,11.20155,[0.7493540051679587],13.850388
4,0.400363,9.738032,9.29845,[0.2506459948320413],19.687491


In [41]:
def get_classification_non_ironic(masterdf, newdf, mastercolumnindex_number, newcolumnindexnumber, weight):
    
    non_ironic_word_avergae = masterdf.iloc[1][mastercolumnindex_number]
      
    #access column ONLY and all rows
    x = list(newdf.iloc[:,newcolumnindexnumber])

    new_list = []
    
    for item in x:
        new_list.append(abs(non_ironic_word_avergae - item)*weight)

        
    return new_list

In [42]:
#get results from classification determiner
non_ironic_average_word_length = get_classification_non_ironic(mastergeneral_df, summary, 1, 0, 1)
non_ironic_average_sent_length= get_classification_non_ironic(mastergeneral_df, summary, 2, 1, 1)
non_ironic_punct_richness = get_classification_non_ironic(mastergeneral_df, summary, 4, 3, 1)
non_ironic_average_uppercase = get_classification_non_ironic(mastergeneral_df, summary, 5, 4, 1)

ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,PUNCT RICH,UPPERCASE,Feature Weight
0,0.251373,4.261968,12.70155,[0.7493540051679587],17.964245
1,0.224183,4.595301,0.20155,[0.2506459948320413],5.27168
2,0.679183,12.938032,65.29845,[6.250645994832041],85.16631
3,0.637516,1.261968,11.20155,[0.7493540051679587],13.850388
4,0.400363,9.738032,9.29845,[0.2506459948320413],19.687491


In [43]:
#Create PREDICTOR DATAFRAME with classifications

non_ironic_predictor_df = pd.DataFrame(non_ironic_average_word_length)

non_ironic_predictor_df.columns = ['WORD LENGTH'] + non_ironic_predictor_df.columns.tolist()[1:]

non_ironic_predictor_df["SENTENCE LENGTH"] = non_ironic_average_sent_length
non_ironic_predictor_df["PUNCT RICH"] = non_ironic_punct_richness
non_ironic_predictor_df["UPPERCASE"] = non_ironic_average_uppercase
non_ironic_predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,PUNCT RICH,UPPERCASE
0,0.144545,5.805244,26.517912,[1.1320368474923237]
1,0.33101,6.138577,14.017912,[0.1320368474923237]
2,0.78601,11.394756,51.482088,[5.867963152507676]
3,0.744344,2.805244,25.017912,[1.1320368474923237]
4,0.293535,8.194756,4.517912,[0.1320368474923237]


In [44]:
non_ironic_feature_prediction = non_ironic_predictor_df.sum(axis=1)

In [45]:
non_ironic_predictor_df["Feature Weight"] = non_ironic_predictor_df.sum(axis=1)

In [46]:
#create final df with final predicitons
final_predictordf = pd.DataFrame(non_ironic_feature_prediction)

final_predictordf.columns = ["Non-ironic Feature Result"] + final_predictordf.columns.tolist()[1:]
final_predictordf["Ironic Feature Result"] = ironic_feature_prediction

final_predictordf.head()

Unnamed: 0,Non-ironic Feature Result,Ironic Feature Result
0,33.599738,17.964245
1,20.619536,5.27168
2,69.530817,85.16631
3,29.699536,13.850388
4,13.13824,19.687491


In [47]:
#return list of ïronic vs non-ironic strings from column with lowest number


def final_predicition_results(feature_resultdf):
    
    list_of_tuple_results = [tuple(x) for x in feature_resultdf.to_records(index=False)]
    
    prediciton_list = []
    
    for tup in list_of_tuple_results:
        non_ironic, ironic = tup
    
        if non_ironic > ironic:
            prediciton_list.append("1") #ironic
                
        elif non_ironic < ironic:
            prediciton_list.append("-1") #non-ironic
    
    
    
    return prediciton_list 

In [48]:
final_prediction = final_predicition_results(final_predictordf)
val["Prediction"] = final_prediction
print(len(val))

195


In [49]:
def accuracy(testdf):
    
    label = list(testdf.iloc[:,0])
    prediciton = list(testdf.iloc[:,2])
    list_of_tuple_evaluations = list(zip(label, prediciton))
    
    gold_match = []
    no_match = []
    
    for tup in list_of_tuple_evaluations:
        label, prediction = tup
        
        if label == int(prediction):
            gold_match.append("True") #gold label match
                
        elif label != int(prediction):
            no_match.append("False")
            
    total_leng = len(testdf)
    
    gold_leng = len(gold_match)
    
    accuracy = gold_leng / total_leng  
    
    
    
    
    
    return accuracy
    

In [52]:
accuracy = accuracy(val)
print(accuracy)

0.517948717948718


# Import wordtype count master for comparison

In [None]:
master_wordtype_filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/train_summary_wordtype.csv"
masterwordtype_df = pd.read_csv(master_filename)
masterwordtype_df.head()

masterwordtype_df.rename(columns={mastergeneral_df.columns[0]: "Class"})

# Import NER count master for comparison

In [None]:
master_ner_filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/train_summary_namedentity.csv"
masterentity_df = pd.read_csv(master_ner_filename)
masterentity_df.head()

masterentity_df.rename(columns={mastergeneral_df.columns[0]: "Class"})

# Import punctuation type count master for comparison

In [None]:
master_punct_filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/train_summary_puncttype.csv"
masterpunct_df = pd.read_csv(master_punct_filename)
masterpunct_df.head()

masterpunct_df.rename(columns={mastergeneral_df.columns[0]: "Class"})