# Note : Import required packages and dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import time
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from nltk import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
%matplotlib notebook

# Data Science Project

# A. This project is a culmination of various aspects of AI.
## 1. Machine Learning 
## 2. Deep Learning
## 3. Knowledge Graphs

# B. Aim: The project falls under the Natural Language Processing domain. It aims to identify unwanted comments in the comment sections of Wikipedia discussion pages. It is a multi-headed classification problem, where each comment will get classified based on different types and each type will have levels. The user can then decide what levels of toxicity are acceptable in the comment section.

# C. The following steps will be followed during the course of the project. 
## 1. Importing the data
## 2. Data Preprocessing
###  a. Noise Removal
###  b.Lexicon Normalization
###  c.Lemmatization
###  d.Stemming
###  e.Object Standardization
## 3. Text to Features (Feature Engineering on text data)
###  a.Syntactical Parsing
###  b.Dependency Grammar
###  c.Part of Speech Tagging
###  d.Entity Parsing
###  e.Phrase Detection
###  f. Named Entity Recognition
###  g.Topic Modelling
###  h. N-Grams
###  i.  Statistical features
###  j. TF – IDF
###  k. Frequency / Density Features
###  l. Readability Features
###  m. Word Embeddings
## 4. Modelling 
###  a.Text Classification
###  b.Text Matching
###  c. Levenshtein Distance
###  d. Phonetic Matching
###  e.  Flexible String Matching
###  f. Coreference Resolution
## 5. Testing the accuracy of the Model
###  a. Model on Test Data 
###  b. Human input testing

# 1. Import the train and test data

In [None]:
def _read_utf8_csv(path):
    """Open *path* as UTF-8 text and parse it into a DataFrame."""
    with open(path, encoding = 'utf8') as handle:
        return pd.read_csv(handle)


# Load the training data, the test data and the sample submission file.
train_data = _read_utf8_csv('train.csv')
test_data = _read_utf8_csv('test.csv')
sample_output = _read_utf8_csv('sample_submission.csv')

In [None]:
# Show the first ten rows of the training data.
train_data.head(10)

In [None]:
# Show the first ten rows of the test data.
test_data.head(10)

In [None]:
# Show the first ten rows of the sample submission.
sample_output.head(10)

# 2. Data processing

## 2.a Noise Removal

## 3. Text to Features (Feature Engineering on text data)

In [None]:
# Read the training and test CSV files into the `train` / `test` frames
# used by the feature-engineering cells.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
# Every training column after the first two is kept as a class name.
classes = list(train.columns[2:])
print(train.shape, test.shape)
# Combined number of rows in the training and test frames.
total_comments = len(train) + len(test)
print(total_comments)

In [None]:
# Stack the first two columns of the training frame on top of the first two
# columns of the test frame (training rows first).
merge = pd.concat([frame.iloc[:, :2] for frame in (train, test)])
# df holds the same rows with a fresh 0..n-1 index.
df = merge.reset_index(drop=True)

In [None]:
#df = pd.read_csv('train.csv')

Feature extraction and cleaning should be performed on the combined dataset of training and testing instances. The text features considered here are:
1) Word Count 2) Unique Word Count 3) Sentence Count 4) Exclamation Mark Count 5) Capital Word Count 6) Percentage of unique words in comment 7) Percentage of capital words in comment

In [None]:
# Per-comment text statistics, stored as new columns on the combined frame.
# A "word" here is a run of 2 to 25 ASCII letters.

# count of words
df['word_count'] = df["comment_text"].apply(
    lambda x: len(re.findall(r'[a-zA-Z]{2,25}', x)))
# count of unique words (case-insensitive)
df['unique_word_count'] = df["comment_text"].apply(
    lambda x: len({w.lower() for w in re.findall(r'[a-zA-Z]{2,25}', x)}))
# count of sentences, approximated as 1 + the number of newlines that are
# followed by a character outside the bracketed set in the pattern
df['sentence_count'] = df["comment_text"].apply(
    lambda x: len(re.findall('\n[^(\n)|^( +\n)]', x)) + 1)
# count of exclamation marks (the previous "+1" reported one mark for
# comments that contain none)
df['exclamation_mark_count'] = df["comment_text"].apply(lambda x: x.count('!'))
# count of uppercase words (runs of 2 to 25 capital letters)
df['capital_word_count'] = df["comment_text"].apply(
    lambda x: len(re.findall(r'[A-Z]{2,25}', x)))
# fraction (0-1, two decimals) of unique words out of total words;
# comments with word_count == 0 yield NaN here
df['perc_unique_words'] = np.round(df['unique_word_count'] / df['word_count'], 2)
# fraction (0-1, two decimals) of capital words out of total words;
# comments with word_count == 0 yield NaN here
df['perc_cap'] = np.round(df['capital_word_count'] / df['word_count'], 2)

In [None]:
# df was built with the training rows first, so the first len(train) rows are
# the training features and the remainder are the test features.
n_train = len(train)
train_feats = df.iloc[:n_train]
test_feats = df.iloc[n_train:]
print(train_feats.shape, test_feats.shape)

In [None]:
# Take the tag columns (everything after the first two training columns) and
# place them side by side with the features computed for the training rows.
train_tags = train.iloc[:, 2:]
train_feats = pd.concat([train_feats, train_tags], axis='columns')

In [None]:
# The corpus is the comment_text column of the combined train + test frame.
corpus = merge['comment_text']

In [None]:
# Removal of everything except words, followed by word lemmatization, to
# produce a cleaned corpus.
tokenizer = TweetTokenizer()

# Noise patterns, compiled once and written as raw strings so the backslashes
# reach the regex engine instead of being treated as string escapes.
# Non-greedy: each bracketed span is removed on its own. A greedy ".*" would
# also delete all text lying between the first "[" and the last "]".
_BRACKETED_RE = re.compile(r"\[.*?\]")
_IP_ADDRESS_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
_NON_WORD_RE = re.compile(r"[^\w\s]")
_DIGITS_RE = re.compile(r"\d+")

# (POS-tag prefix, WordNet part of speech) pairs, checked in this order:
# nouns, verbs, adjectives, then tags starting with "R" (adverbs).
_WORDNET_POS_BY_TAG_PREFIX = (("NN", "n"), ("VB", "v"), ("JJ", "a"), ("R", "r"))


def clean_corpus(text):
    """Return a lower-cased, de-noised and lemmatized version of *text*.

    Cleaning steps, in order: lower-case; newlines to spaces; bracketed
    spans, dotted-quad IP addresses and question marks to spaces; every
    remaining character that is neither a word character nor whitespace is
    dropped; digits are dropped. The result is tokenized with the
    module-level ``tokenizer``, POS-tagged, and each token is lemmatized
    according to its tag. Tokens whose tag matches none of the known
    prefixes are kept unchanged.

    Args:
        text: The raw comment as a string.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    text = text.lower()
    text = text.replace("\n", " ")
    text = _BRACKETED_RE.sub(" ", text)
    # IP addresses must go before punctuation is stripped, while the dots
    # that identify them are still present.
    text = _IP_ADDRESS_RE.sub(" ", text)
    text = text.replace("?", " ")
    text = _NON_WORD_RE.sub("", text)
    text = _DIGITS_RE.sub("", text)
    words = tokenizer.tokenize(text)

    # Performs Word Lemmatization
    wnl = WordNetLemmatizer()
    lemmed_words = []
    for word, tag in pos_tag(words):
        for prefix, wordnet_pos in _WORDNET_POS_BY_TAG_PREFIX:
            if tag.startswith(prefix):
                word = wnl.lemmatize(word, pos=wordnet_pos)
                break
        lemmed_words.append(word)

    return " ".join(lemmed_words)

In [None]:
# Print the first raw comment of the corpus.
print(corpus.iloc[0])

In [None]:
# Run the cleaning function on the first comment only, as a quick check.
clean_corpus(corpus.iloc[0])

In [None]:
# Clean every comment of the combined corpus.
cleaned_corpus = corpus.apply(clean_corpus)

In [None]:
# Print the first comment after cleaning.
print(cleaned_corpus.iloc[0])

In [None]:
# Fit a unigram TF-IDF vocabulary on the cleaned train + test corpus and time it.
start_unigrams = time.time()
# use_idf / smooth_idf are documented as booleans, so pass True rather than 1.
tfidf = TfidfVectorizer(min_df = 200,  max_features = 10000, strip_accents = 'unicode',
                        analyzer = 'word', ngram_range = (1,1), use_idf = True, smooth_idf = True,
                        sublinear_tf = True, stop_words = 'english')
tfidf.fit(cleaned_corpus)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old method on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    features = np.array(tfidf.get_feature_names_out())
else:
    features = np.array(tfidf.get_feature_names())

end_unigrams = time.time()

print("Total time to compute unigrams",end_unigrams-start_unigrams)

In [None]:
# Vectorize the cleaned comments: the first train.shape[0] rows of
# cleaned_corpus go to the training matrix, the remaining rows to the test matrix.
n_train_rows = train.shape[0]
train_unigrams = tfidf.transform(cleaned_corpus.iloc[:n_train_rows])
test_unigrams = tfidf.transform(cleaned_corpus.iloc[n_train_rows:])

In [None]:
# Number of terms kept in the TF-IDF vocabulary.
print(len(features))

In [None]:
#https://buhrmann.github.io/tfidf-analysis.html
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

    Args:
        row: 1-D array of tfidf scores, one per feature.
        features: sequence of feature names, indexable by the same positions as row.
        top_n: maximum number of (feature, score) pairs to return.

    Returns:
        A DataFrame with columns 'feature' and 'tfidf', ordered by descending
        score. It is empty (but still has both columns) when row is empty or
        top_n is 0.
    '''
    # argsort is ascending, so reverse it before taking the first top_n positions.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Passing columns= to the constructor also works when top_feats is empty;
    # assigning .columns afterwards raises a length mismatch in that case.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Rank the tfidf features of a single document.

    Row row_id of Xtr is converted to a dense array, squeezed to drop
    length-one axes, and passed to top_tfidf_feats together with the
    feature names and top_n.
    '''
    dense_row = Xtr[row_id].toarray()
    doc_scores = np.squeeze(dense_row)
    return top_tfidf_feats(doc_scores, features, top_n)

def top_mean_feats(Xtr, features, grp_ids, min_tfidf=0.1, top_n=25):
    ''' Rank features by their mean tfidf score over a group of documents.

    The rows of Xtr selected by grp_ids are converted to a dense array,
    scores below min_tfidf are set to zero, and the column-wise mean is
    passed to top_tfidf_feats to pick the top_n features.
    '''
    group_scores = Xtr[grp_ids].toarray()

    # Scores under the threshold count as zero in the average.
    below_threshold = group_scores < min_tfidf
    group_scores[below_threshold] = 0
    mean_scores = group_scores.mean(axis=0)
    return top_tfidf_feats(mean_scores, features, top_n)

# modified for multilabel multiclass
def top_feats_by_class(Xtr, features, min_tfidf=0.1, top_n=20):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label.

    One DataFrame is produced per column of the module-level train_tags
    frame, using the rows where that column equals 1. The column name is
    stored on the result as the attribute `label`.

    NOTE(review): the index labels of train_tags are used as row positions
    into Xtr, which assumes train_tags has a default 0..n-1 index - confirm.
    '''
    dfs = []
    for col in train_tags.columns:
        ids = train_tags.index[train_tags[col] == 1]
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        # The original assigned the undefined name `label` here, which raised
        # NameError; the class being processed is `col`.
        feats_df.label = col
        dfs.append(feats_df)
    return dfs

In [None]:
# Get the top-n unigram features for each class, computed on the training matrix.
# NOTE(review): the variable name looks like a typo for "tfidf_top_n_per_class";
# it is left unchanged here in case other cells refer to it.
tfidf_top_n_per_lass=top_feats_by_class(train_unigrams,features)

In [None]:
# Importing Gensim
import gensim
from gensim import corpora

# NOTE(review): `doc_clean` is not defined in the visible part of this file.
# It is passed to corpora.Dictionary and each element to doc2bow, so it is
# presumably a list of tokenized documents - confirm before running this cell.
# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
