### TEXT PREPROCESSING

In [128]:
#!py -3.8 -m pip install -r requirements.txt

In [129]:
import time
import nltk
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from ast import literal_eval
import re, string
import pandas as pd

In [130]:
def data():
    global language, dataset_path
    
    keyb = input("Choose language: type English, Spanish or Greek.")
    if(keyb.casefold() == "english"):
        language = 'english'
    elif(keyb.casefold() == "spanish"):
        language = 'spanish'
    elif(keyb.casefold() == "greek"):
        language = 'greek'
    else:
        print("Invalid input. Please type English, Spanish or Greek.")
        data()

    dataset_path = "dataset_"+ language +".txt"

    return language, dataset_path

In [131]:
data()

('greek', 'dataset_greek.txt')

In [132]:
df =  pd.read_csv(dataset_path)
tweets = df['tweet'].tolist()

In [133]:
def preprocess_word(w):
    # Removes punctuation
    
    translator = str.maketrans('', '', string.punctuation)
    punctuation = w.translate(translator)

    return punctuation

In [134]:
def preprocessing(x):
    # Returns a nested list of the processed sentences
    
    # Removes mentions, numbers and links
    mentions = [re.sub(r'@\w+',"", sent) for sent in tweets]
    numbers = [re.sub('[0-9]+', "", sent) for sent in mentions]
    links = [re.sub(r'http\S+', "", sent) for sent in numbers]
    emoji = [re.sub("[\U0001F600-\U0001F64F]+", "", sent) for sent in links]
    symbols = [re.sub("[\U0001F300-\U0001F5FF]+", "", sent) for sent in emoji]

    # Removes stopwords
    stop_words = set(stopwords.words(language))
    filtered_sentence = [w for w in symbols if not w.lower() in stop_words]
    
    # Removes lower text, word tokenization
    lower = [[sent.lower()] for sent in filtered_sentence]
    in_list = [word for sent in lower for word in sent]
    word_tokenized = [word_tokenize(sent) for sent in in_list]
    word_tokenized = [sent for sent in word_tokenized if sent]
    
    for _id, sent in enumerate(word_tokenized):
        word_tokenized[_id] =  [preprocess_word(w) for w in sent]
    
    # Removes empty elements, sentences and retweets
    words = [[word for word in sent if word != '' and word != 'rt' and len(word)>1] for sent in word_tokenized]
    sentences = [sent for sent in words if sent]

    return sentences

In [135]:
text = preprocessing(tweets)
print(text[:50])

[['θυμάστε', 'πριν', 'χρόνια', 'που', 'κοιμόμασταν', 'με', 'ανοιχτά', 'παράθυρα', 'επειδή', 'νιώθαμε', 'ασφαλείς', 'τώρα', 'που', 'καταδικάστηκε', 'χρυσή', 'αυ…'], ['για', 'να', 'καταλάβουμε', 'συμφωνεί', 'με', 'αποτέλεσμα', 'πολ', 'που', 'είναι', 'υπέρ', 'του', 'θανάτου', 'του', 'κουφοντινα', 'αλλά', 'σο…'], ['λες', 'να', 'ψηφίζει', 'πολακη'], ['so', 'the', 'cage', 'has', 'finally', 'open'], ['για', 'κιλά', 'χόρτο', 'ισόβια', 'για', 'φόνους', 'αλλοδαπών', 'και', 'αντιφασιστών', 'χρόνια', 'δικαιοσύνη', 'στις', 'δόξες', 'τις'], ['κόσμος', 'σάς', 'ευχαριστεί', 'για', 'τη', 'δροσερή', 'πνοή', 'δημοκρατίας'], ['το', 'πιο', 'ανήθικο', 'από', 'όλα', 'όμως', 'είναι', 'αυτό', 'που', 'έχει', 'συμβεί', 'με', 'τους', 'ρουσφετολογικούς', 'εμβολιασμούς', 'με', 'αποκορύφωμα', 'το', 'πρό…'], ['την', 'κυριακή', 'με', 'την', 'ψήφο', 'μας', 'υπογράφουμε', 'μια', 'νέα', 'συμφωνία', 'συμφωνία', 'αλήθειας', 'ενότητας', 'προόδου', 'με', 'ισχυρή', 'ανάπτυξη…'], ['πατήρ', 'επικράτησε', 'του', 'υιού', 'με', 'ε

### BAG OF WORDS 

In [136]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from ast import literal_eval

try: 
    assert(literal_eval(str(text)) == text.copy())
except AssertionError:
    print('failed to convert')
    
final_str = ([" ".join(x) for x in text])

count_vect = CountVectorizer()
bow = count_vect.fit_transform(final_str).toarray()
print(bow[:50])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [137]:
print(type(bow))
print(type(bow[0]))
print(type(bow[0][0]))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.int64'>


In [138]:
vocab = count_vect.get_feature_names()

### EMBEDDINGS 

In [None]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import numpy as np

model = Word2Vec(sentences=text, window=5, min_count=1, workers=4)
model.save("word2vec.model")

embeddings = [model.wv[word] for word in text]

Calculate the word vector average for every sentence:

In [None]:
v_average = []
for i in text:
    av = np.mean(model.wv[i], axis=0)
    v_average.append(av)

### SYNTAX

In [None]:
def flatten_list(x):
#Takes a nested list and converts it into a list of elements
#where every sublist is a new element

    new_list = [] 
    
    for sent in x:
        sentences = " ".join(sent)
        new_list.append(sentences)
    
    return new_list

new = flatten_list(text)
print(new[:50])

['θυμάστε πριν χρόνια που κοιμόμασταν με ανοιχτά παράθυρα επειδή νιώθαμε ασφαλείς τώρα που καταδικάστηκε χρυσή αυ…', 'για να καταλάβουμε συμφωνεί με αποτέλεσμα πολ που είναι υπέρ του θανάτου του κουφοντινα αλλά σο…', 'λες να ψηφίζει πολακη', 'so the cage has finally open', 'για κιλά χόρτο ισόβια για φόνους αλλοδαπών και αντιφασιστών χρόνια δικαιοσύνη στις δόξες τις', 'κόσμος σάς ευχαριστεί για τη δροσερή πνοή δημοκρατίας', 'το πιο ανήθικο από όλα όμως είναι αυτό που έχει συμβεί με τους ρουσφετολογικούς εμβολιασμούς με αποκορύφωμα το πρό…', 'την κυριακή με την ψήφο μας υπογράφουμε μια νέα συμφωνία συμφωνία αλήθειας ενότητας προόδου με ισχυρή ανάπτυξη…', 'πατήρ επικράτησε του υιού με ευχαριστώ για την συμμετοχή σας', 'αυτό που προσπαθώ να σου πω είναι ότι αυτό το κράτος δε θελει να σωθεί', 'εσείς σήμερα καλυβεστε θα μείνετε ξεκαλυβωτοι', 'περιμένω', 'κατατίθεται στη βουλή αντικαπνιστικός νόμος ένα στοίχημα που πρέπει να κερδηθεί σε συμμαχία με όλους τους πολίτε…', 'κακώς συντηρείτε τον ι

In [None]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import spacy
import string
import pprint
%matplotlib inline

In [None]:
#Append every word to a wordset
wordset = set()
for sentence in text:
    for word in sentence:
        wordset.add(word)

In [None]:
print(wordset)

{'πόζες', 'πήγα', 'επιχειρήσεις', 'τον…', 'προσλήψεις', 'τωρα', 'πρόσληψη', 'πράγμα', 'κακούργοι', 'επέκταση', 'επεισόδιο', 'εμάς', 'θυμήσου', 'καταφέρεται', 'ευθύνες', 'εκλογές', 'βάλουν', 'ψυχολογική', 'ανακατευτηκε', 'τούρκο', 'μίας', 'τοισλαμε…', 'ποδοσφαιρ…', 'εννοούν', 'πό…', 'αρμόδιος', 'αμηχανία', 'λαθραιους', 'εγγυημένο', 'κακοποιών', 'ολα', 'τεχνολογία', 'έδειρε', 'φορές', 'τριημέρου', 'παππούδες', 'μωρομάνες', 'πλησιαζει', 'σελεμπριτι', 'λινού', 'πληγείσες', 'ην', 'κρυο', 'είναι', 'κανάλι', 'βλακώδεις', 'κόμματα', 'πείσουμε', 'i…', 'μπροστά', 'δίπλωμα', 'αγαπητέ', 'οποτε', 'έπεσε', 'νόμιμους', 'shut', 'κανω', 'λαθρομεταναστών', 'φασιστα', 'βιώνουν', 'λαθρομεταναστες', 'διορισει', 'ξέρετε', 'κλειδί', 'ισλαμοναζι', 'πλιατσικολόγος', 'coups', 'ρίξε', 'κοπτεται', 'σημερινής', 'συντάκτη', 'δημοκρατική', 'απελασεις', 'τζιχαντιστές', 'συμπεριφορές', 'ροχαλα', 'μεγαλύτερες', 'λιβαδειά', 'μπόχα', 'ενόπλους', 'υπέγραψε', 'βαρέσουν', 'προγράμματός', 'γαμώ', 'nea', 'περνούν', 'δημοκρατί

In [None]:
#Add every word of the dataset as a node
base_graph = nx.Graph()
base_graph.add_nodes_from(wordset)

In [None]:
rep = {}
processed_sentences = []

if language == 'english':
    nlp = spacy.load("en_core_web_sm")
elif language == 'spanish':
    nlp = spacy.load("es_core_news_sm")
elif language == 'greek':
    nlp = spacy.load("el_core_news_sm")

timestamps1 = []

start_time1 = time.time()
for sent_id, sent in enumerate(new[:40]):
    sentence_graph = base_graph.copy()
    processed_sentences.append(nlp(sent))
    if sent_id % 5 == 0:
        timestamps1.append(time.time() - start_time1)

In [None]:
index = []
for i in range(5, len(text)):
    if i % 5 == 0:
        index.append(i)
    else:
        continue

for x,y in zip(index, timestamps1):
    print("Creating graphs: ", x, " sentences in ", y, "seconds")

Creating graphs:  5  sentences in  0.6060085296630859 seconds
Creating graphs:  10  sentences in  2.038012742996216 seconds
Creating graphs:  15  sentences in  3.4740099906921387 seconds
Creating graphs:  20  sentences in  4.806994676589966 seconds
Creating graphs:  25  sentences in  6.606042146682739 seconds
Creating graphs:  30  sentences in  8.245002031326294 seconds
Creating graphs:  35  sentences in  10.181997060775757 seconds
Creating graphs:  40  sentences in  12.081008195877075 seconds


In [None]:
#Add edges between the nodes according to syntactic relations
start_time2 = time.time()
timestamps2 = []
for sent_id, sent in enumerate(processed_sentences[:40]):
    for token in sent:
        nodeA = token.text
        nodeB = token.head.text
        sentence_graph.add_edge(nodeA, nodeB)
        sentence_representation =  nx.adjacency_matrix(sentence_graph) #sparse matrix
        rep[sent_id] = sentence_representation.toarray()
    if sent_id % 5 == 0:
        timestamps2.append(time.time() - start_time2)

MemoryError: Unable to allocate 222. MiB for an array with shape (7637, 7637) and data type int32

In [None]:
for x,y in zip(index, timestamps2):
    print("Adding edges: ", x, " sentences in ", y, "seconds")

Adding edges:  5  sentences in  0.0779874324798584 seconds
Adding edges:  10  sentences in  0.3469836711883545 seconds
Adding edges:  15  sentences in  0.558997392654419 seconds
Adding edges:  20  sentences in  0.706977367401123 seconds
Adding edges:  25  sentences in  0.8059976100921631 seconds
Adding edges:  30  sentences in  1.0039756298065186 seconds
Adding edges:  35  sentences in  1.2319750785827637 seconds


In [None]:
rep.values()

dict_values([])

In [None]:
#Flatten the sentence representation array
flat_list = []
key_order = sorted(rep.keys())

for key in key_order:
    sentence = np.array([item for sublist in rep[key] for item in sublist])
    flat_list.append(sentence)

synt = np.asarray(lst)

46225 215 315 21


In [None]:
# for sentence_id, sentence in enumerate(rep.values()):
    #arr.append([item for item in outer_list for inner_list in outer_list])
    
# The above array doesn't necessarily keep the sentences' order
# A dictionary should be used instead
#
# arr = {}
# for sent_id, sent in enumerate(rep.values()):
#    ...
#     arr[sent_id] = 

In [None]:
# store the key order:
# key_order = sorted(rep.keys())
# every for loop in a sentence_id : some_value dictionary :
# for key in key_order:
#    value = rep[key]
    # do stuff 
# print(value)

In [None]:
# options = {
#    "font_size": 20,
#    "node_size": 30,
#    "node_color": "white",
#    "edgecolors": 'blue',
#    "linewidths": 1,
#    "width": 1,
#
# plt.figure(3,figsize=(33,33))
# nx.draw(sentence_graph, with_labels=True, **options)

### CLASSIFICATION

#### Classification using Bag-of-Words:

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

In [None]:
x = bow
y = df['class'].astype(int)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
logr = LogisticRegression()
logr.fit(x_train, y_train)
bow_predictions = logr.predict(x_test)
print(bow_predictions)

[1 1 1 1 1 1 1 1 1 1]


In [None]:
bow_report = classification_report(y_test, bow_predictions)
print(bow_report)

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        10

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



#### Classification using embeddings:

In [None]:
v_train, v_test, y_train, y_test = train_test_split(v_average, y, test_size=0.25, random_state=0)
logr.fit(v_train, y_train)
emb_predictions = logr.predict(v_test)
print(emb_predictions)

[1 1 1 1 1 1 1 1 1 1]


In [None]:
emb_report = classification_report(y_test, emb_predictions)
print(emb_report)

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        10

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



#### Classification using both Bag-of-Words and embeddings: 

In [None]:
conc = np.concatenate([bow, v_average], axis=1)

In [None]:
c_train, c_test, y_train, y_test = train_test_split(conc, y, test_size=0.25, random_state=0)
logr.fit(c_train, y_train)
bow_emb_predictions = logr.predict(c_test)
print(bow_emb_predictions)

[1 1 1 1 1 1 1 1 1 1]


In [None]:
bow_emb_report = classification_report(y_test, bow_emb_predictions)
print(bow_emb_report)

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        10

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



#### Classification using syntax

In [None]:
g_train, g_test, y_train, y_test = train_test_split(synt, y, test_size=0.25, random_state=0)
logr.fit(g_train, y_train) 
syntax_predictions = logr.predict(g_test)
#print(syntax_predictions) 

ValueError: setting an array element with a sequence.

In [None]:
syntax_report = classification_report(y_test, syntax_predictions)
print(syntax_report)

In [None]:
# if laptop dies, use some dimensionality reduction method (eg PCA)