<a href="https://colab.research.google.com/github/anmaxwell/UniNotebooks/blob/master/2AssesB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install scattertext

In [0]:
pip install "git+https://github.com/facebookresearch/fastText.git"

Import all necessary packages

In [0]:
import fasttext.util
import pandas as pd
import re
import scattertext as st
import spacy

from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from scattertext import CorpusFromPandas, produce_scattertext_explorer
from sklearn.model_selection import train_test_split

Read in data and look at first item

In [0]:
# Load the training set; the three column names are supplied explicitly.
column_names = ['unique_id', 'text', 'aggression-level']
df = pd.read_csv('agr_en_train.csv', sep=',', names=column_names)

# Print the first record as a quick sanity check of the parsed columns.
first_row = df.iloc[0]
print(first_row)

Check for missing values

In [0]:
# True if at least one cell anywhere in the frame is missing.
missing_mask = df.isna()
missing_mask.values.any()

Count the occurrences per Aggression Level

In [0]:
# Number of rows per aggression label (shows the class balance).
label_counts = df['aggression-level'].value_counts()
label_counts

Identify the top occurring words per Aggression Level

In [0]:
# Parse every comment with spaCy so scattertext can build a term-frequency
# corpus. The model is loaded by its explicit package name: the 'en' shortcut
# link was removed in spaCy 3, while 'en_core_web_sm' resolves whenever that
# model package is installed.
nlp = spacy.load('en_core_web_sm')
df['parsed'] = df.text.apply(nlp)

# Build the corpus keyed on the aggression label and drop spaCy's stop words;
# ignore_absences=True tolerates stop words that never occur in the corpus.
data = (
    st.CorpusFromParsedDocuments(df, category_col='aggression-level', parsed_col='parsed')
    .build()
    .remove_terms(nlp.Defaults.stop_words, ignore_absences=True)
)

# Term-frequency table with one '<label> freq' column per aggression label.
freq_df = data.get_term_freq_df()

# Pick each label's column by NAME before sorting. Dropping the "other"
# columns by position silently keeps the wrong column if the category order
# in freq_df is not OAG, NAG, CAG.
oag_tw = freq_df[['OAG freq']].sort_values(by=['OAG freq'], ascending=False)
nag_tw = freq_df[['NAG freq']].sort_values(by=['NAG freq'], ascending=False)
cag_tw = freq_df[['CAG freq']].sort_values(by=['CAG freq'], ascending=False)

# Show the five most frequent terms for each label.
print(oag_tw.head())
print(nag_tw.head())
print(cag_tw.head())

Replace the text labels with integer class values (OAG → 0, NAG → 1, CAG → 2)

In [0]:
# Encode the three aggression labels as integer class ids.
label_ids = {'OAG': 0, 'NAG': 1, 'CAG': 2}
df['aggression-level'] = df['aggression-level'].replace(label_ids)

Clean up the text: replace non-ASCII characters with spaces, spell out `&` and `@` as words, and convert everything to lowercase (numbers and stopwords could also be removed at this stage)

In [0]:
def clean_up(text):
  """Normalise a raw comment string.

  Every non-ASCII character is replaced by a single space, '&' becomes
  ' and ', '@' becomes ' at ', and the result is lowercased.

  Args:
    text: the raw string to clean.

  Returns:
    The cleaned, lowercased string.
  """
  text = re.sub(r'[^\x00-\x7f]', r' ', text)
  text = text.replace('&', ' and ')
  text = text.replace('@', ' at ')
  # lower must be *called*: the bare attribute `text.lower` is the bound
  # method object, not a string.
  return text.lower()

Load the fasttext model

In [0]:
# Fetch the English fastText model, then load it from the working directory.
# NOTE(review): if_exists='ignore' presumably skips the download when the file
# is already present; confirm against the fastText docs.
model_path = 'cc.en.300.bin'
fasttext.util.download_model('en', if_exists='ignore')
ft = fasttext.load_model(model_path)

In [0]:
# Vocabulary size: join all comments into one string, lowercase it, split on
# whitespace and count the distinct tokens.
all_text = df['text'].str.cat(sep=' ')
vocabulary = set(all_text.lower().split())
unique_words = len(vocabulary)

# Number of rows in the data set.
data_count = len(df)

# Dimensionality reported by the loaded fastText model.
dims = ft.get_dimension()