In [1]:
#import/install all packages at the top

import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk 
from collections import Counter
from spacy.lang.en.stop_words import STOP_WORDS
import string

In [2]:
#import and read file in df with pandas (for better visualisation)

filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/irony-labeled.csv"

gold_label = pd.read_csv(filename)

In [3]:
#rename the columns
gold_label.columns = ["Comment_Text", "Label"]

# Split into Train (70), Validation (10) and Test (20) sets

-using scikit learn 'train_test_split' function twices gives the validation set

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
y = gold_label["Comment_Text"]
x = gold_label["Label"]


In [6]:
#test and train sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=43)

#split the training set to get validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=43)

In [7]:
#pandas concat joins series together (i.e. dataframes)
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)
val = pd.concat([X_val, y_val], axis=1)

# VALIDATION TESTING WIOOOO


In [8]:
#check format of train df

print(val)

      Label                                       Comment_Text
1836     -1  Cola costs more per litre than petrol around h...
1704     -1  Damn.   I can't believe this book sells.   It'...
1656     -1  My mother-in-law watches way to much cable TV,...
792      -1  Do you know what type of salt was used for hea...
1286     -1  Also, I'm pretty sure surveys have shown that ...
1930     -1  I thought this was even more impressive:\n\n&g...
1639     -1  I think the article made it pretty clear that ...
286      -1  Libertarians are the mirror image of communist...
692      -1             She's a reformist, not a revolutionary
538      -1                    It should have absolutely none.
1648      1                        Can't argue with that logic
140      -1  She would have much more effect as the chairma...
1796     -1  If you think your legislator is too conservati...
860      -1  I love this clip sooo much, because she just n...
995       1                               who?  the dem

In [9]:
# #Convert into 2 dictionaries
val_dict = val.set_index(val.index).T.to_dict()

print(len(val_dict))
print(len(val_dict))

195
195


# FUNCTIONS

In [10]:
def get_all_tokens(test_dict):
    """Take dictionary and return list of comments as spacy docs"""
    comment_list = []
    for comment_index, label in test_dict.items():
        for key in label:
            text = label[key]
            if type(text) == str:
                comment_list.append(nlp(text))
    return comment_list

In [11]:
def get_words(listx):
    """Take a list (already parsed through SpaCy) remove punctuation and return list of word tokens"""
    ir_clean_docs = [] #remove punctuation

    for x in listx:
        clean_list = []
        for y in x:
            if y.pos_ != 'PUNCT':
                clean_list.append(y)
        ir_clean_docs.append(clean_list)
    return ir_clean_docs

In [12]:
def get_punct(listx):
    """Take a list (already parsed through spacy), remove words and return list of punctuation ONLY"""
    ir_punct = [] #only punctuation

    for x in listx:
        clean_list = []
        for y in x:
            if y.pos_ == 'PUNCT':
                clean_list.append(y)
        ir_punct.append(clean_list)
    return ir_punct

In [13]:
def average_word_length(doc):
    """Take doc and return average word length"""
    for token in doc:
        word = token.text
        average_word_length = sum(len(word) for word in doc) / len(doc)
    return(average_word_length)

In [14]:
def average_sent_length(doc):
    """Take doc and return average sentence length"""
    sent_list = []

    for sent in doc.sents:
        len_sent = len(sent)
        sent_list.append(len_sent)

    total = sum(sent_list)
    leng = len(sent_list)

    average_sent_length = total / leng
    return(average_sent_length)

In [15]:
def relative_count_wordtypes(doc):
    """Return relative count average for all word types i.e. nouns, pronouns, verbs etc with word type as key and average as value"""
    pos_tags = []
    for token in doc:
        pos_tags.append(token.pos_)
    counting = Counter(pos_tags) #returns dictionary with whole count for each word type in doc
    
    leng = len(doc) #overall length of doc (no. of tokens)
    new_dict = {}
    
    for key, value in counting.items(): #iterate over entire dict
        new_dict[key] = value/ leng
            
            
    return new_dict

In [16]:
def check_sarcsymbol(comment_list):
    """Take a list of comments (parsed through SpaCy); return list of items if "/s" is present [Reddit "/s" = sarcasm]"""
    sarcsymb = []
    for x in comment_list:
        for y in x:
            if y.text == "/s":
                sarcsymb.append(x)
    return(sarcsymb)


In [17]:
def get_punct_average(punctuation_list, token_comment_list):
    """Take preprocessed list of punctuation and full token list (MUST be of equal length); 
    Returns list of the average for ALL punctuation (based on number overall of tokens)
    for each comment""" 

    punct_count = []
    for comment in punctuation_list:
        punct_count.append(len(comment))

    len_comment = []
    for comment in token_comment_list:
        len_comment.append(len(comment))
    
    punct_count, len_comment = np.array(punct_count), np.array(len_comment) 
    averages = punct_count + len_comment/2
    return averages

In [18]:
def get_indiv_punct(doc):
    """Return relative count average for all word types i.e. nouns, pronouns, verbs etc with word type as key and average as value"""
    punc_tags = []
    for token in doc:
        if token.is_punct:
            punc_tags.append(token)
            
    
    #make each a string so not multiple keys with same vaues
    punc_tags = [str(punc) for punc in punc_tags]
           

    punc_tag_dict = Counter(punc_tags) #returns dictionary with whole count for each word type in doc
    
    leng = len(doc) #overall length of doc (no. of tokens)
    new_dict = {}
    
    for key, value in punc_tag_dict.items(): #iterate over entire dict
        new_dict[key] = value/ leng
            
    final_dict = dict(new_dict)
            
    return final_dict

In [74]:
def count_uppercase(doc):
    """Take nlp doc and return the average number of fully uppercase words for each comment as a list"""
    listd = []
    
    for token in doc:
        if token.is_upper == True:
            listd.append(token)
            
    counting = Counter(listd)
    
    my_dict = dict(counting)
    upper_count_avg = []
    
#     for key, value in my_dict.items():
    x = sum(my_dict.values())
    upper_count_avg.append(x)
#         if key == str:
#             my_dict[key] = sum(values)
    return upper_count_avg

In [75]:
def get_entities(doc):
    """Take nlp doc and return a dictionary with key as ent.labe_ and value as the average number"""
    entity = []
    for token in doc.ents:
        entity.append(token.label_)

    new_dict = Counter(entity)
    leng = len(doc)
    
    for key, value in new_dict.items():
        new_dict[key] = value / leng
        
    ent_dict = dict(new_dict)
    
    return ent_dict



# START THE VAL TESTING

In [76]:
#1) GET ALL TOKENS
tokens = get_all_tokens(val_dict)

In [77]:
#2) Get list of ONLY words (no punct)
word_list = get_words(tokens)

In [78]:
#3) Get list of ONLY punct (no words)
punct_list = get_punct(tokens)

In [79]:
#Create df for total, full returns for irony
total_val= pd.DataFrame({'Comment Parsed':tokens})
total_val["Tokens"] = word_list
total_val["Punctuation"] = punct_list
total_val.head()

Unnamed: 0,Comment Parsed,Tokens,Punctuation
0,"(Cola, costs, more, per, litre, than, petrol, ...","[Cola, costs, more, per, litre, than, petrol, ...",[.]
1,"(Damn, ., , I, ca, n't, believe, this, book,...","[Damn, , I, ca, n't, believe, this, book, se...","[., ., ,, .]"
2,"(My, mother, -, in, -, law, watches, way, to, ...","[My, mother, in, law, watches, way, to, much, ...","[-, -, ,, ,, ,, ., ,, ., ,, "", ,, ,, !, !, "", .]"
3,"(Do, you, know, what, type, of, salt, was, use...","[Do, you, know, what, type, of, salt, was, use...",[?]
4,"(Also, ,, I, 'm, pretty, sure, surveys, have, ...","[Also, I, 'm, pretty, sure, surveys, have, sho...","[,, ., ,, .]"


In [25]:
#4) WORD LENGTH
average_word_list = []
for comment in word_list:
    average_word_list.append(average_word_length(comment))

#Create DataFrame for Summary of Irony STATS
summary= pd.DataFrame({"Average Word Length": average_word_list})

In [26]:
#SENTENCE LENGTH
average_sentence_list = []
for x in tokens:
    average_sentence_list.append(average_sent_length(x))

#Add to Summary of Irony STATS df
summary["Average Sentence Length"] = average_sentence_list
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length
0,4.555556,10.0
1,4.08,9.666667
2,3.625,27.2
3,3.666667,13.0
4,4.704545,24.0


In [27]:
#/S SYMBOLS

sarcsymb = check_sarcsymbol(tokens)
sarcsymb = (len(sarcsymb))

summary["sarcsymb"] = sarcsymb

In [28]:
#PUNCTUATION RICHNESS
average_punct_list = get_punct_average(punct_list, tokens)

summary["Punctuation Richness"] = average_punct_list
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length,sarcsymb,Punctuation Richness
0,4.555556,10.0,2,6.0
1,4.08,9.666667,2,18.5
2,3.625,27.2,2,84.0
3,3.666667,13.0,2,7.5
4,4.704545,24.0,2,28.0


In [29]:
#WORD TYPE AVERAGE 

average_wordtype_list = []
for comment in tokens:
    average_wordtype_list.append(relative_count_wordtypes(comment))

summary_wordtypedf = pd.DataFrame(average_wordtype_list)
summary_wordtypedf.head()

Unnamed: 0,ADJ,ADP,ADV,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SPACE,SYM,VERB,X
0,0.1,0.2,0.2,,,,0.2,,,,0.1,0.1,,,0.1,
1,,0.068966,0.068966,0.034483,0.068966,0.034483,0.172414,,,0.103448,,0.137931,0.068966,,0.241379,
2,0.073529,0.088235,0.029412,0.036765,0.088235,,0.161765,0.014706,0.022059,0.088235,0.014706,0.117647,0.022059,0.007353,0.227941,0.007353
3,0.076923,0.153846,,,,,0.307692,,,0.076923,,0.076923,,,0.307692,
4,0.0625,0.125,0.083333,0.041667,0.020833,,0.1875,,0.041667,0.0625,,0.083333,0.020833,,0.270833,


In [30]:
#INDIVIDUAL PUNCTUATION AVERAGE

average_indiv_punc_list = []
for x in tokens:
    average_indiv_punc_list.append(get_indiv_punct(x))

summary_indiv_punct = pd.DataFrame(average_indiv_punc_list)
summary_indiv_punct.head()

Unnamed: 0,!,"""",#,%,&,',(,),*,",",...,:,:),;,?,[,],_,–,“,”
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,0.034483,...,,,,,,,,,,
2,0.014706,0.014706,,,,,,,,0.051471,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,0.076923,,,,,,
4,,,,,,,,,,0.041667,...,,,,,,,,,,


In [31]:
#UPPER CASE WORDS (total)

uppercase_list = []
for b in tokens:
    uppercase_list.append((count_uppercase(b)))

summary["Uppercase Average"] = uppercase_list
summary.head()

Unnamed: 0,Average Word Length,Average Sentence Length,sarcsymb,Punctuation Richness,Uppercase Average
0,4.555556,10.0,2,6.0,[0]
1,4.08,9.666667,2,18.5,[1]
2,3.625,27.2,2,84.0,[7]
3,3.666667,13.0,2,7.5,[0]
4,4.704545,24.0,2,28.0,[1]


In [32]:
#GET IR ENTITIES 
named_entity_list = []
for comment in tokens:
    named_entity_list.append(get_entities(comment))
    
summary_named_entity = pd.DataFrame(named_entity_list)

# summary_named_entity.head()  

# Import master general table for comparison

In [82]:
master_filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/train_summary_general.csv"
mastergeneral_df = pd.read_csv(master_filename)
mastergeneral_df.head()

mastergeneral_df.rename(columns={mastergeneral_df.columns[0]: "Class"})

Unnamed: 0,Class,Average Word Length,Average Sentence Length,Number of '/s' symbols,Punctuation Richness,Average Number of Uppercase Words
0,Ironic Comments,4.304183,14.261968,3,18.70155,0.749354
1,Non-ironic Comments,4.41101,15.805244,0,32.517912,1.132037


In [58]:
#summary
#mastergeneral_df

def get_classification(masterdf, newdf, mastercolumnindex_number, newcolumnindexnumber):
    
    ironic_word_avergae = masterdf.iloc[0][mastercolumnindex_number]
    
    non_ironic_word_average = masterdf.iloc[1][mastercolumnindex_number]

    median  = (ironic_word_avergae + non_ironic_word_average)/ 2
    
    row_numb = newdf.count()
    leng = int(row_numb.mean())
    
    #access column ONLY and all rows
    x = list(newdf.iloc[:,newcolumnindexnumber])

    new_list = []
    
    for item in x:
        if item > median:
            new_list.append("-1")
#           "Non-ironic"
            
        elif item < median:
            new_list.append("1")
#             "Ironic"
            
        else:
            new_list.append("0")
#             "Neutral"
        
    return new_list

In [80]:
#get results from classification determiner
average_word_length = get_classification(mastergeneral_df, summary, 1, 0)
average_sent_length= get_classification(mastergeneral_df, summary, 2, 1)
punct_richness = get_classification(mastergeneral_df, summary, 4, 3)
average_uppercase = get_classification(mastergeneral_df, summary, 5, 4)


In [70]:
#VAL is original df with gold labels

#Create PREDICTOR DATAFRAME with classifications

predictor_df = pd.DataFrame(average_word_length)

predictor_df.columns = ['WORD LENGTH'] + predictor_df.columns.tolist()[1:]

predictor_df["SENTENCE LENGTH"] = average_sent_length
predictor_df["PUNCT RICH"] = punct_richness
predictor_df["UPPERCASE"] = average_uppercase
predictor_df.head()

Unnamed: 0,WORD LENGTH,SENTENCE LENGTH,PUNCT RICH,UPPERCASE
0,-1,1,1,1
1,1,1,1,-1
2,1,-1,-1,-1
3,1,1,1,1
4,-1,-1,-1,-1


In [129]:
# predictor_df = predictor_df.astype(int) #convert entire df to integer types
# predictor_df.dtypes #check types

####NEED TO GET COUNTER FOR NUMBER OF -1 AND 1
#if number of -1 is greater than 1, label as -1 overall, vice versa





x = dict(predictor_df.iloc[1].value_counts()) #this gives counter dict FOR ONE ROW

#NEED TO ITERATE OVER EACH ROW IN PREDICTOR_DF AND RETURN A LIST OF ALL RESULTS
### CAN THEN ADD THIS LIST TO VAL DF TO COMPARE




In [111]:
# first_final_predictor_classification = list(predictor_df.sum(axis=1))

# first_final_predictor_classification



#print(first_final_predictor_classification)

# # #summary
# # # #mastergeneral_df

# def get_FINAL_classification(masterdf, newdf, mastercolumnindex_number, newcolumnindexnumber):
    
#     final_predictor_classification = list(predictor_df.sum(axis=1))
    
# #     non_ironic_word_average = masterdf.iloc[1][mastercolumnindex_number]

#     median  = (ironic_word_avergae + non_ironic_word_average)/ 2
    
#     row_numb = newdf.count()
#     leng = int(row_numb.mean())
    
#     #access column ONLY and all rows
#     x = list(newdf.iloc[:,newcolumnindexnumber])

#     new_list = []
    
#     for item in x:
#         if item > median:
#             new_list.append("-1")
# #           "Non-ironic"
            
#         elif item < median:
#             new_list.append("1")
# #             "Ironic"
            
#         else:
#             new_list.append("0")
# #             "Neutral"
        
#     return new_list

# Import wordtype count master for comparison

In [71]:
master_wordtype_filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/train_summary_wordtype.csv"
masterwordtype_df = pd.read_csv(master_filename)
masterwordtype_df.head()

masterwordtype_df.rename(columns={mastergeneral_df.columns[0]: "Class"})

Unnamed: 0,Class,Average Word Length,Average Sentence Length,Number of '/s' symbols,Punctuation Richness,Average Number of Uppercase Words
0,Ironic Comments,4.304183,14.261968,3,18.70155,0.749354
1,Non-ironic Comments,4.41101,15.805244,0,32.517912,1.132037


# Import NER count master for comparison

In [72]:
master_ner_filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/train_summary_namedentity.csv"
masterentity_df = pd.read_csv(master_ner_filename)
masterentity_df.head()

masterentity_df.rename(columns={mastergeneral_df.columns[0]: "Class"})

Unnamed: 0,Class,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART
0,Ironic,0.041123,0.045283,0.025641,0.003165,0.048072,,0.015036,0.039665,0.036168,0.044726,0.033967,0.055027,0.045914,0.047837,0.01114,0.003165,0.03235,0.027053
1,Non-ironic,0.034334,0.023964,0.009453,0.012705,0.029929,0.021429,0.02059,0.019512,0.025125,0.034095,0.017485,0.032427,0.025188,0.03766,0.009576,0.021969,0.01667,0.018885


# Import punctuation type count master for comparison

In [73]:
master_punct_filename= "/Users/laure/OneDrive/Dokumente/VU/Python for Text Analysis/Final Assignment/train_summary_puncttype.csv"
masterpunct_df = pd.read_csv(master_punct_filename)
masterpunct_df.head()

masterpunct_df.rename(columns={mastergeneral_df.columns[0]: "Class"})

Unnamed: 0,Class,!,"""",""")",#,%,&,',(,),...,[,\,],_,§,–,—,‘,“,”
0,Ironic,0.093206,0.063918,0.003165,,0.047312,0.020822,0.070586,0.017297,0.018151,...,0.023683,0.032258,0.032581,0.018987,,0.003165,0.029412,0.003165,0.018484,0.020308
1,Non-ironic,0.084272,0.05079,,0.027027,0.022251,0.020579,0.053885,0.018097,0.021424,...,0.076399,,0.122762,0.011111,0.011765,0.011765,0.009143,0.008478,0.011914,0.01263
