# Imports and Read Data

In [1]:
from __future__ import (
    print_function,
    division
)

import matplotlib.pyplot as plt
import stopwordsiso as swiso
import seaborn as sns
import pandas as pd
import numpy as np
import cleantext
import warnings
import random
import string
import pickle
import spacy
import json
import nltk
import time
import abc
import os 
import re
import sys 
sys.path.append('./src')

from spellchecker import SpellChecker
from stop_words import get_stop_words
from collections import OrderedDict
from nltk.stem import PorterStemmer
from collections import Counter
from nltk.corpus import stopwords
from wordcloud import WordCloud


from textblob import (
    TextBlob,
    Word
)

from typing import (
    Callable,
    Iterable,
    List,
    Union,
    Tuple,
)

from utils import (
    Classifier,
    Pipeline,
    json_print,
    timeit,
    random_seed,
    #save_obj
)
from supervised import (
    KNeighborsClassifier,
    MultiNominalNaiveBayes,
    BernaulliNaiveBayes
)
from feature import (
    BackwardElimination,
    #mutual_information
)

In [3]:
# Load the preprocessed dataset; the path is resolved relative to the
# notebook's working directory (one level above it, in `data/`).
df = pd.read_csv('../data/cleaned_preprocessed_data.csv')

# Post-preprocessing

In [4]:
# Columns carried forward from the preprocessed CSV.
cols = [
    'id',
    'text',
    'clean_text',
    'user',
    'sentiment',
]

# Take an explicit copy so the column assignments below write to an
# independent frame instead of a selection of the loaded one
# (avoids SettingWithCopyWarning).
df = df[cols].copy()

# Remove literal commas and periods. `regex=False` is spelled out because the
# default for `regex` differs between pandas versions, and "." interpreted as
# a regular expression would match every character.
df['clean_text'] = df['clean_text'].str.replace(",", '', regex=False)
df['clean_text'] = df['clean_text'].str.replace(".", '', regex=False)
df['clean_text'] = df['clean_text'].str.strip()

# `id` is not needed downstream.
df.drop(
    ['id'],
    axis = 1,
    inplace = True
)

# Number of whitespace-separated tokens per document. `str(t)` makes missing
# values count as the single token 'nan', so they fall under the filter below.
df['doc_count'] = df['clean_text'].apply(lambda t: len(
    str(t).split()
    )
)

# Discard documents with two tokens or fewer.
df.drop(
    df[df['doc_count'] <= 2].index,
    inplace = True
)


In [5]:
# Preview the first rows of the frame after the filtering above.
df.head()

Unnamed: 0,text,clean_text,user,sentiment,doc_count
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww bummer shoulda david carr day,_TheSpecialOne_,0,6
1,is upset that he can't update his Facebook by ...,upset updat facebook text result school blah,scotthamilton,0,7
2,@Kenichan I dived many times for the ball. Man...,dive time ball manag save rest bound,mattycus,0,7
3,my whole body feels itchy and like its on fire,bodi feel itchi,ElleCTF,0,3
7,@LOLTrish hey long time no see! Yes.. Rains a...,hey time rain bit bit lol fine,coZZ,0,7


In [6]:
# Count missing values per column.
df.isna().sum()

text          0
clean_text    0
user          0
sentiment     0
doc_count     0
dtype: int64

# Construction of Corpus

In [7]:
# Vocabulary table: split every document into tokens, flatten them into one
# long Series, and count how often each token occurs (most frequent first).
df_vocab = (
    df['clean_text']
    .str.split(expand=True)
    .stack()
    .value_counts()
    .reset_index()
)
df_vocab.columns = ['word', 'frequency']

df_vocab.head(10)

Unnamed: 0,word,frequency
0,day,94954
1,love,73157
2,time,59677
3,lol,49958
4,feel,44326
5,watch,40297
6,amp,40251
7,night,40220
8,hope,34472
9,tomorrow,30858


In [8]:
# Number of rows in the vocabulary table (one row per distinct token).
len(df_vocab)

337107

In [25]:
# Join all documents into one space-separated string, split it back into
# tokens, and tally the occurrences of each token.
word_freq = Counter(df['clean_text'].str.cat(sep=' ').split())

# Ten most frequent tokens with their counts.
print(word_freq.most_common(10))

[('day', 94954), ('love', 73157), ('time', 59677), ('lol', 49958), ('feel', 44326), ('watch', 40297), ('amp', 40251), ('night', 40220), ('hope', 34472), ('tomorrow', 30858)]


# Post-processing with Word Count Data

In [26]:
# Plain-dict copy of the token counts, used for per-word lookups when filtering.
dict_freq = dict(word_freq)
# Corpus-frequency cut-off: the filter keeps only words whose count is
# strictly greater than this value.
freq_threshold = 1000

In [27]:
# Keep only the tokens whose corpus-wide count exceeds `freq_threshold`, then
# re-join the surviving tokens into a single space-separated string.
df['clean_freq_removed_text'] = df['clean_text'].apply(
    lambda text: " ".join(
        token
        for token in text.split()
        if dict_freq[token] > freq_threshold
    )
)

In [28]:
# Token count of each document after the frequency filter.
df['doc_count_clean_freq_removed_text'] = df['clean_freq_removed_text'].map(
    lambda doc: len(str(doc).split())
)

# Remove documents left with two tokens or fewer.
df.drop(
    index=df.loc[df['doc_count_clean_freq_removed_text'] <= 2].index,
    inplace=True,
)

In [29]:
# Vocabulary table for the frequency-filtered text: tokenise, flatten, and
# count occurrences per token (most frequent first).
df_vocab_greq_removed = (
    df['clean_freq_removed_text']
    .str.split(expand=True)
    .stack()
    .value_counts()
    .reset_index()
)
df_vocab_greq_removed.columns = ['word', 'frequency']

In [36]:
class Vocabulary:
    """Word <-> id lookup built from a vocabulary table.

    Parameters
    ----------
    vocab_dict : dict
        Must contain the keys ``'word'`` and ``'frequency'``. Each value is
        either a dict keyed by id (the layout produced by
        ``DataFrame.to_dict()``) or a sequence ordered by id.

    Attributes
    ----------
    id2word : id -> word container taken from ``vocab_dict['word']``.
    frequency : id -> count container taken from ``vocab_dict['frequency']``.
    word2id : dict mapping each word to its id.
    """

    def __init__(
        self,
        vocab_dict,

    ):
        super().__init__()

        assert 'word' in vocab_dict.keys() and 'frequency' in vocab_dict.keys(), \
            "vocab_dict must contain the keys 'word' and 'frequency'"

        self.vocab_dict = vocab_dict

        self.id2word = vocab_dict['word']
        self.frequency = vocab_dict['frequency']

        # Iterating a dict yields its keys, so enumerate() over an id-keyed
        # dict would pair every id with itself. Go through (id, word) pairs
        # explicitly so that word2id really maps word -> id.
        self.word2id = {
            word: idx for idx, word in self._indexed(self.id2word)
        }

    @staticmethod
    def _indexed(container):
        """Return (id, value) pairs for an id-keyed dict or an ordered sequence."""
        if isinstance(container, dict):
            return container.items()
        return enumerate(container)

    def __getitem__(self, idx):
        """Return the word for a single id, or a list of words for a list/array of ids."""
        if isinstance(idx, (list, np.ndarray)):
            return [self.id2word[i] for i in idx]

        return self.id2word[idx]

    # NOTE(review): json_print is imported from the project-local `utils`
    # module. __str__/__repr__ must return a str, so confirm that json_print
    # returns its formatted text rather than only printing it.
    def __str__(self):
        return json_print(self.id2word)

    def __repr__(self):
        return json_print(self.id2word)

    def __len__(self):
        """Number of distinct words in the vocabulary."""
        return len(self.word2id)

    def get_vocab(self):
        """Return the word -> id mapping."""
        return self.word2id

    def get_frequency_dict(self):
        """Return a dict mapping each word to its frequency."""
        return {
            self.id2word[idx]: freq
            for idx, freq in self._indexed(self.frequency)
        }

    def save(self, filename: str) -> None:
        """Pickle this instance's attributes to ``filename + '.pkl'``."""
        self.save_obj(
            self.__dict__,
            filename
        )

    def load(self, filename: str) -> None:
        """Replace this instance's attributes with those stored in ``filename + '.pkl'``."""
        self.__dict__ = self.load_obj(filename)

    def save_obj(
        self,
        obj:object,
        path:str = None
    ) -> None:
        """Save a Python object as a pickle at ``path + '.pkl'``.

        Raises
        ------
        TypeError
            If ``path`` is not provided.
        """
        if path is None:
            raise TypeError("save_obj() requires a path (without the '.pkl' suffix)")

        with open(path + '.pkl', 'wb') as f:
            pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

    def load_obj(
        self,
        path:str = None
    ) -> object:
        """Load a Python object from the pickle at ``path + '.pkl'``.

        Raises
        ------
        TypeError
            If ``path`` is not provided.
        """
        if path is None:
            raise TypeError("load_obj() requires a path (without the '.pkl' suffix)")

        # SECURITY: unpickling can execute arbitrary code. Only load files
        # that were written by this notebook or another trusted source.
        with open(path + '.pkl', 'rb') as f:
            return pickle.load(f)


# Vocabulary over the full cleaned corpus.
vocab = Vocabulary(vocab_dict=df_vocab.to_dict())

# Vocabulary restricted to the words that survived the frequency filter.
vocab_freq_removed = Vocabulary(vocab_dict=df_vocab_greq_removed.to_dict())




In [37]:
# Persist both vocabularies as pickles; Vocabulary.save appends '.pkl' to the
# given path.
# NOTE(review): the input CSV is read from '../data/' while these files go to
# 'data/' — confirm the intended output directory.
vocab.save('data/vocabulary')
vocab_freq_removed.save('data/vocabulary_freq_removed')

In [31]:
# Write the filtered frame to parquet for the training stage.
# NOTE(review): the input CSV is read from '../data/' while this file goes to
# 'data/' — confirm the intended output directory.
df.to_parquet('data/final_training_data.parquet')