### TEXT PREPROCESSING

In [None]:
import nltk, re, string
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer 
from ast import literal_eval

txt = open('dataset.txt')
raw = txt.read()


def preprocess_word(w):
    # Removes punctuation
    translator = str.maketrans('', '', string.punctuation)
    punctuation = w.translate(translator)

    return punctuation


def preprocessing(x):
    # Returns a nested list of the processed sentences
    
    mentions = re.sub(r'@\w+',"", x) #removes mentions
    numbers = re.sub('[0-9]+', "", mentions) #removes numbers
    links = re.sub(r'http\S+', "", numbers) #removes links
    
    sentence_tokenized = sent_tokenize(links) #sentence tokenization - INACCURATE!!!
    
    lower = [[sent.lower()] for sent in sentence_tokenized] #lower text
     
    in_list = [word for sent in lower for word in sent]

    word_tokenized = [word_tokenize(sent) for sent in in_list]
    word_tokenized = [sent for sent in word_tokenized if sent] #word tokenization
    
    for _id, sent in enumerate(word_tokenized):
        word_tokenized[_id] =  [preprocess_word(w) for w in sent]
    
    words = [[word for word in sent if word != '' and word != 'rt'] for sent in word_tokenized] #removes useless words
    sentences = [sent for sent in words if sent] #removes empty sentences
    
    
    return sentences


text = preprocessing(raw)
print(text)


    
#lemmatization
#quotes = punctuation.replace("'","").replace("\"","")

#df = pd.DataFrame(text)
#writer = pd.ExcelWriter('test.xlsx', engine='xlsxwriter')
#df.to_excel(writer, sheet_name='welcome', index=False)
#writer.save()

### BAG OF WORDS 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from ast import literal_eval

try:
    assert(literal_eval(str(text)) == text.copy())
except AssertionError:
    print('failed to convert')
    
final_str = [" ".join(x) for x in text]

train, test = train_test_split(final_str, test_size=0.2)

count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(train)
print(train_counts)

### EMBEDDINGS 

In [None]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

model = Word2Vec(sentences=text, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

vector = model.wv['bitch'] #returns numpy vector of a word

sims = model.wv.most_similar('bitch', topn=10) #returns similar words

print(vector)
print(sims)

In [None]:
from sklearn.decomposition import IncrementalPCA  
from sklearn.manifold import TSNE                 
import numpy as np                                  


def reduce_dimensions(model):
    num_dimensions = 2 


    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key) 

    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels


x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)
    data = [trace]

    if plot_in_notebook:
        init_notebook_mode(connected=True)
        iplot(data, filename='word-embedding-plot')
    else:
        plot(data, filename='word-embedding-plot.html')


def plot_with_matplotlib(x_vals, y_vals, labels):
    import matplotlib.pyplot as plt
    import random

    random.seed(0)

    plt.figure(autosize=True)
    plt.scatter(x_vals, y_vals)

    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, 25)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

plot_function(x_vals, y_vals, labels)

### DEPENDENCIES

(1) Get POS tags for every sentence in text <br>
(2) Parse every sentence according to its POS tags <br>
(3) Create the graph for every syntax tree <br>
(4) Convert the graph into its adjacency matrix <br>
(5) Simplify the adjacency matrix to a vector <br>
(6) Use vectors to train the classifier <br>

#### 1. POS Tagging

In [None]:
def flatten_list(x):
#Takes a nested list and converts it into a list of elements
#where every sublist is a new element

    new_list = [] 
    
    for sent in text:
        sentences = " ".join(sent)
        new_list.append(sentences)
    
    return new_list

new = flatten_list(text)

In [34]:
import spacy
from spacy import displacy
from spacy.symbols import nsubj, VERB


def pos_tags(x):
#Returns a nested list of POS tags
    
    nlp = spacy.load("en_core_web_sm")
    tokens = []
    for sent in x:
        doc = nlp(sent)    
        toks = [token.pos_, for token in doc]      
        tokens.append(toks)
    
    return tokens


tags = pos_tags(new)
print(tags)

SyntaxError: invalid syntax (<ipython-input-34-81bed9c672fa>, line 13)

#### 2. Dependency Parsing

In [38]:
def dependency_parsing(x):
# Returns a nested list of syntactic labels

    nlp = spacy.load("en_core_web_sm")
    dependencies = []
    for sent in x:
        doc = nlp(sent)
        new_list = [token.dep_ for token in doc]
        dependencies.append(new_list)
        
    return dependencies

dependency_parsing(new)


[['advmod',
  'nsubj',
  'ROOT',
  'xcomp',
  'poss',
  'compound',
  'dobj',
  'ccomp',
  'ccomp',
  'dobj',
  'oprd'],
 ['npadvmod',
  'det',
  'amod',
  'compound',
  'compound',
  'nsubj',
  'ROOT',
  'dative',
  'dobj',
  'nsubj',
  'aux',
  'advmod',
  'ccomp',
  'poss',
  'attr',
  'nsubj',
  'advmod',
  'parataxis',
  'dobj',
  'prep',
  'pobj',
  'cc',
  'advmod',
  'acomp',
  'det',
  'nsubj',
  'conj',
  'prep',
  'nsubj',
  'pcomp',
  'prep',
  'pobj'],
 ['nsubj',
  'ROOT',
  'prep',
  'pobj',
  'cc',
  'conj',
  'nsubj',
  'parataxis',
  'prep',
  'pobj'],
 ['amod',
  'amod',
  'nsubj',
  'ROOT',
  'nmod',
  'nsubj',
  'ccomp',
  'nmod',
  'cc',
  'conj',
  'nsubj',
  'ccomp',
  'acomp',
  'nsubj',
  'ccomp',
  'det',
  'compound',
  'dobj'],
 ['nsubj',
  'advmod',
  'det',
  'compound',
  'ROOT',
  'advmod',
  'dobj',
  'aux',
  'nsubj',
  'advcl',
  'mark',
  'nsubj',
  'advcl'],
 ['amod', 'ROOT', 'acl', 'agent', 'amod', 'prt', 'compound', 'pobj', 'appos'],
 ['ROOT', 'ac

#### 3. Graphs

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

%matplotlib inline
G=nx.Graph()
G.add_edge('am','I')
G.add_edge('am','Angelina')
nx.draw(G, with_labels=True)
plt.show()

#### 4. Adjacency Matrices

In [None]:
# A class to represent the adjacency list of the node
class AdjNode:
    def __init__(self, data):
        self.vertex = data
        self.next = None


# A class to represent a graph. A graph
# is the list of the adjacency lists.
# Size of the array will be the no. of the
# vertices "V"
class Graph:
    def __init__(self, vertices):
        self.V = vertices
        self.graph = [None] * self.V

    # Function to add an edge in an undirected graph
    def add_edge(self, src, dest):
        # Adding the node to the source node
        node = AdjNode(dest)
        node.next = self.graph[src]
        self.graph[src] = node

        # Adding the source node to the destination as
        # it is the undirected graph
        node = AdjNode(src)
        node.next = self.graph[dest]
        self.graph[dest] = node

    # Function to print the graph
    def print_graph(self):
        for i in range(self.V):
            print("Adjacency list of vertex {}\n head".format(i), end="")
            temp = self.graph[i]
            while temp:
                print(" -> {}".format(temp.vertex), end="")
                temp = temp.next
            print(" \n")


# Driver program to the above graph class
if __name__ == "__main__":
    V = 5
    graph = Graph(V)
    graph.add_edge(0, 1)
    graph.add_edge(0, 4)
    graph.add_edge(1, 2)
    graph.add_edge(1, 3)
    graph.add_edge(1, 4)
    graph.add_edge(2, 3)
    graph.add_edge(3, 4)

    graph.print_graph()

#### 5. Matrices to vectors

### CLASSIFICATION

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()