# COMP90051 Authorship Attribution 
----------
    
`1.` [Loading Data](#loading)

    1.1 Loading Packages
    1.2 Loading Train and Test Data



<a class="anchor" id="loading"></a>
# Loading Data

In [83]:
import re, pickle, os, string
import numpy as np 
import pandas as pd 
import nltk
import string
import spacy
from nltk.corpus import stopwords
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex
from spacy.tokenizer import Tokenizer
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

In [75]:
# Module-level holders for the raw frames; load_data() fills them in.
train_data = None
test_data = None

def load_data():
    """Read the train and test tweet files into the module-level frames.

    Both files are parsed as tab-separated text with no header row and are
    stored in the globals ``train_data`` and ``test_data``. Nothing is
    returned.
    """
    global train_data, test_data
    # NOTE(review): read_csv uses its default quote handling here, so a tweet
    # containing a double quote may be parsed differently from a plain
    # tab split - confirm the loaded row count matches the file's line count.
    read_options = dict(delimiter="\t", header=None)
    train_data = pd.read_csv('train_tweets.txt', **read_options)
    test_data = pd.read_csv('test_tweets_unlabeled.txt', **read_options)

load_data()
# Only the training frame gets named columns in this cell.
train_data.columns = ['Author', 'Tweet']
print(train_data)

        Author                                              Tweet
0         8746     @handle Let's try and catch up live next week!
1         8746  Going to watch Grey's on the big screen - Thur...
2         8746  @handle My pleasure Patrick....hope you are well!
3         8746  @handle Hi there! Been traveling a lot and lot...
4         8746  RT @handle Looking to Drink Clean & Go Green? ...
5         8746  RT @handle: Ft. Hood officials confirm the 2 o...
6         8746  RT @handle: Mickey Mouse is Getting a Make Ove...
7         8746           @handle How did u get the invite Justin?
8         8746  @handle I think I am still a good friend of he...
9         8746  @handle I remember! I am fine - how are u? Wha...
10        8746     @handle That's great - good for the coach!!!!!
11        8746  @handle I don't want to picture u sitting on i...
12        8746  @handle D- Thanks for the RTs....are you going...
13        8746           @handle Grrr....you must be going crazy!
14        

In [3]:
def load_pickle(filepath):
    """Load and return the object stored in the pickle file at ``filepath``.

    The file is opened in binary mode inside a ``with`` block, so the handle
    is closed even when unpickling raises.

    SECURITY: ``pickle.load`` can execute arbitrary code contained in the
    file. Only call this on files you created yourself or otherwise trust.
    """
    with open(filepath, 'rb') as documents_f:
        return pickle.load(documents_f)

def save_pickle(data, filepath):
    """Pickle ``data`` to the file at ``filepath``, overwriting any existing file.

    The file is opened in binary write mode inside a ``with`` block, so it is
    flushed and closed even when pickling raises (for example when ``data``
    contains an object that cannot be pickled).
    """
    with open(filepath, 'wb') as save_documents:
        pickle.dump(data, save_documents)

In [87]:
def custom_tokenizer(nlp):
    """Build a spaCy ``Tokenizer`` for ``nlp`` with a custom infix pattern.

    Prefix and suffix rules are compiled from ``nlp.Defaults``; the infix rule
    is a single character class of punctuation and quote characters. No
    ``token_match`` override is installed.

    :param nlp: pipeline object providing ``vocab`` and ``Defaults``.
    :return: the configured ``Tokenizer``.
    """
    # Prefix / suffix searchers come from the pipeline's language defaults.
    find_prefix = compile_prefix_regex(nlp.Defaults.prefixes).search
    find_suffix = compile_suffix_regex(nlp.Defaults.suffixes).search
    # Infix matches: any one of these punctuation / quote characters.
    iter_infixes = re.compile(r'''[.\?\:\;\...\‘\’\`\“\”\"\'~]''').finditer

    return Tokenizer(
        nlp.vocab,
        prefix_search=find_prefix,
        suffix_search=find_suffix,
        infix_finditer=iter_infixes,
        token_match=None,
    )
# Load the spaCy pipeline registered as "en" and swap in the tokenizer built
# by custom_tokenizer().
# NOTE(review): "en" looks like a shortcut model name; newer spaCy releases
# may require the full package name instead - confirm against the installed
# spaCy version.
spacy_nlp = spacy.load("en")
spacy_nlp.tokenizer = custom_tokenizer(spacy_nlp)

# Stop-word list: NLTK's English stop words plus the fragments "'s", "'m" and
# "ca", with "no", "not" and every entry containing "n't" left out of the list.
stp = [
    candidate
    for candidate in stopwords.words('english') + ["'s", "'m", "ca"]
    if candidate not in ("no", "not") and "n't" not in candidate
]

class PreProcessor(object):
    '''Pre-processor that strips URLs and punctuation from tweets, tokenises
    them and adds simple count/length feature columns to a DataFrame.

    The lemmatisation / stop-word step (``_tokenize_text``) is currently
    disabled (kept below as commented-out code), so ``_stopWordList`` and
    ``_punct_removal`` are initialised but not read by the active methods.
    '''

    # Compiled once for the class. Written as a raw string so the ``\(`` and
    # ``\)`` escapes reach the regex engine without Python's
    # invalid-escape-sequence warning; the pattern itself is unchanged.
    _URL_PATTERN = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    # Translation table that deletes every character in string.punctuation.
    _NO_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        self._stopWordList = stp
        self._punct_removal = list(string.punctuation)

#     def _tokenize_text(self, sample):
#         '''tokenises sentences in order to lemmatise, remove stop words and punctuation, 
#         returns string of processed text'''

#         # get tokens using spacy
#         tokens = spacy_nlp(sample)

#         # lemmatising tokens
#         tokens = [t.lemma_.strip()
#                   if t.lemma_ != "-PRON-"
#                   else t.lower_
#                   for t in tokens]

#         # stopword and punctuation removal
#         tokens = [t.lower() for t in tokens
#                   if (t not in self._stopWordList and t not in self._punct_removal)]

#         processed_text = " ".join(tokens)
#         return processed_text

    def remove_url(self, text):
        '''Return ``text`` with every run of non-whitespace characters that
        starts with "http" removed.'''
        return re.sub(r"http\S+", "", text)

    def check_url(self, text):
        '''Return True when ``text`` contains an http:// or https:// URL,
        False otherwise.'''
        return self._URL_PATTERN.search(text) is not None

    def _tokenize(self, text):
        '''Return the lower-cased NLTK word tokens of ``text`` after removing
        URLs and punctuation.'''
        cleaned = self.remove_url(text).translate(self._NO_PUNCT_TABLE).lower()
        return nltk.word_tokenize(cleaned)

    def transform_text(self, data):
        '''Add tokenised words and simple length features to ``data``.

        ``data`` must have a ``Tweet`` column. The frame is modified in place
        (the columns below are added or overwritten) and the same object is
        returned.

        Columns written:
          * ``words``           - list of lower-cased tokens, URLs and
                                  punctuation removed.
          * ``URL``             - True when the raw tweet contains a URL.
          * ``word_count``      - number of tokens in ``words``.
          * ``sentence_length`` - total number of characters over all tokens.
          * ``text_length``     - number of characters in the raw tweet.
        '''
        # Convert once; every value is passed through str() before use.
        tweets = data['Tweet'].apply(str)

        data['words'] = tweets.apply(self._tokenize)
        data['URL'] = tweets.apply(self.check_url)
        data['word_count'] = data['words'].apply(len)
        data['sentence_length'] = data['words'].apply(lambda w: sum(map(len, w)))
        data['text_length'] = tweets.apply(len)

        return data

In [88]:
# Build the feature columns for the training tweets. transform_text writes the
# new columns onto the frame it is given and returns that same frame, so
# clean_train and train_data refer to one object.
processor = PreProcessor()
clean_train = processor.transform_text(train_data)
clean_train.head(20)

Unnamed: 0,Author,Tweet,words,URL,word_count,sentence_length,text_length
0,8746,@handle Let's try and catch up live next week!,"[handle, lets, try, and, catch, up, live, next...",False,9,35,46
1,8746,Going to watch Grey's on the big screen - Thur...,"[going, to, watch, greys, on, the, big, screen...",False,10,49,66
2,8746,@handle My pleasure Patrick....hope you are well!,"[handle, my, pleasure, patrickhope, you, are, ...",False,7,37,49
3,8746,@handle Hi there! Been traveling a lot and lot...,"[handle, hi, there, been, traveling, a, lot, a...",False,25,100,132
4,8746,RT @handle Looking to Drink Clean & Go Green? ...,"[rt, handle, looking, to, drink, clean, go, gr...",False,18,86,109
5,8746,RT @handle: Ft. Hood officials confirm the 2 o...,"[rt, handle, ft, hood, officials, confirm, the...",False,17,86,105
6,8746,RT @handle: Mickey Mouse is Getting a Make Ove...,"[rt, handle, mickey, mouse, is, getting, a, ma...",True,10,43,76
7,8746,@handle How did u get the invite Justin?,"[handle, how, did, u, get, the, invite, justin]",False,8,31,40
8,8746,@handle I think I am still a good friend of he...,"[handle, i, think, i, am, still, a, good, frie...",False,12,40,55
9,8746,@handle I remember! I am fine - how are u? Wha...,"[handle, i, remember, i, am, fine, how, are, u...",False,11,37,54


In [90]:
# Show the processed training frame in a qgrid widget (toolbar enabled).
# qgrid is imported here rather than in the imports cell at the top.
import qgrid
qgrid_widget = qgrid.show_grid(clean_train, show_toolbar=True)
qgrid_widget

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [91]:
# Persist the processed training frame in the working directory.
save_pickle(clean_train, 'train_data.p')

In [78]:
# Distinct author ids in the training set. Note that `df` holds the array of
# unique ids here, not a DataFrame.
df = train_data['Author'].unique()
print(len(df))
# Per-author count of non-null values in each remaining column.
train_data.groupby('Author').count()

9293


Unnamed: 0_level_0,Tweet
Author,Unnamed: 1_level_1
2,17
3,11
4,18
6,39
7,37
8,34
9,55
10,86
11,19
12,40


In [92]:
# Per-author summary statistics of the word_count column.
clean_train.groupby(['Author'])['word_count'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,17.0,15.823529,5.468493,5.0,13.00,17.0,20.00,26.0
3,11.0,25.363636,33.950766,7.0,13.00,16.0,19.00,127.0
4,18.0,17.555556,3.275977,12.0,16.00,17.0,21.00,22.0
6,39.0,16.076923,6.558982,3.0,11.00,15.0,23.00,26.0
7,37.0,11.756757,7.251074,0.0,6.00,11.0,15.00,26.0
8,34.0,8.323529,2.760331,3.0,6.25,8.0,10.00,15.0
9,55.0,10.581818,6.172492,2.0,5.00,9.0,14.50,27.0
10,86.0,13.325581,6.984760,1.0,8.00,12.5,18.75,28.0
11,19.0,4.105263,2.579723,2.0,3.00,3.0,5.00,13.0
12,40.0,13.525000,5.546944,3.0,9.00,12.5,18.00,25.0


In [93]:
# Read the pickled frame back from disk; the result is displayed as the cell
# output and is not assigned to a variable.
load_pickle("train_data.p")

Unnamed: 0,Author,Tweet,words,URL,word_count,sentence_length,text_length
0,8746,@handle Let's try and catch up live next week!,"[handle, lets, try, and, catch, up, live, next...",False,9,35,46
1,8746,Going to watch Grey's on the big screen - Thur...,"[going, to, watch, greys, on, the, big, screen...",False,10,49,66
2,8746,@handle My pleasure Patrick....hope you are well!,"[handle, my, pleasure, patrickhope, you, are, ...",False,7,37,49
3,8746,@handle Hi there! Been traveling a lot and lot...,"[handle, hi, there, been, traveling, a, lot, a...",False,25,100,132
4,8746,RT @handle Looking to Drink Clean & Go Green? ...,"[rt, handle, looking, to, drink, clean, go, gr...",False,18,86,109
5,8746,RT @handle: Ft. Hood officials confirm the 2 o...,"[rt, handle, ft, hood, officials, confirm, the...",False,17,86,105
6,8746,RT @handle: Mickey Mouse is Getting a Make Ove...,"[rt, handle, mickey, mouse, is, getting, a, ma...",True,10,43,76
7,8746,@handle How did u get the invite Justin?,"[handle, how, did, u, get, the, invite, justin]",False,8,31,40
8,8746,@handle I think I am still a good friend of he...,"[handle, i, think, i, am, still, a, good, frie...",False,12,40,55
9,8746,@handle I remember! I am fine - how are u? Wha...,"[handle, i, remember, i, am, fine, how, are, u...",False,11,37,54
