# Downloading packages

In [None]:
!pip3 install pyspellchecker
!nltk.download('wordnet')

# Imports

In [None]:
import warnings
warnings.filterwarnings("ignore")
import re
import spacy
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os import cpu_count
from multiprocessing import Process, Queue
from spellchecker import SpellChecker
from textblob import TextBlob
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

%matplotlib inline

# Global Variables

In [None]:
# Upper bound on the number of worker processes PreprocessConcurrent starts per batch.
N_PROCESSES = cpu_count()
# NOTE: despite its name, this is a WordNet lemmatizer, not a stemmer.
STEMMER = WordNetLemmatizer()
# Shared spell checker used by preprocess_text to detect and correct unknown words.
SPELL_CHECKER = SpellChecker()
# English stop words, held in a set for fast membership tests.
STOP_WORDS = set(stopwords.words("english"))

# Utility Functions

In [None]:
def cal_skewness(df, cols, threshold=1):
    """Select the columns whose skewness magnitude reaches ``threshold``.

    Args:
        df: Object indexable by column name whose columns expose ``.skew()``
            (e.g. a pandas DataFrame).
        cols: Iterable of column names to inspect, in the order to report them.
        threshold: Minimum absolute skewness for a column to be reported.

    Returns:
        List of the names from ``cols`` with ``abs(skew) >= threshold``,
        preserving the order of ``cols``.
    """
    return [name for name in cols if abs(df[name].skew()) >= threshold]

In [None]:
# Junk sequences seen in the raw text (garbled byte sequences, HTML entities,
# a URL-encoded space and newlines), combined into one regex alternation that
# preprocess_text replaces with a space.
custom_junk_words = "|".join((
    "\x89Û_", "\x89ÛÒ", "\x89Û÷", "&gt;", "%20", "\x89Ûª",
    "&lt;", "\x89ÛÏ", "\x89Û\x9d", "åÊ", "&amp;", "\n",
))
custom_junk_words

In [None]:
# Patterns are compiled once here instead of on every call.
_ACCOUNT_TAG_RE = re.compile(r"@\S+")
_URL_RE = re.compile(r"https?://\S+|www\.\S+")
# Removes a tag, everything after it, up to and including the last tag.
_HTML_RE = re.compile(r"<.*?>.*</?.*?>")
_NON_ALPHA_RE = re.compile(r"[^a-zA-Z\s]")
# Tokens shorter than this are discarded, i.e. words of three letters or fewer.
_MIN_TOKEN_LENGTH = 4


def _drop_short_tokens(tokens):
    """Keep only the tokens that are at least ``_MIN_TOKEN_LENGTH`` characters long."""
    return [token for token in tokens if len(token) >= _MIN_TOKEN_LENGTH]


def preprocess_text(text):
    """Normalise one raw text string into space-separated, lemmatised tokens.

    Steps, in order:
      1. Strip ``@account`` tags, the junk sequences in ``custom_junk_words``,
         URLs, HTML tags and every character that is not an ASCII letter or
         whitespace.
      2. Lowercase, tokenise on whitespace and drop tokens shorter than
         ``_MIN_TOKEN_LENGTH``.
      3. Replace tokens unknown to ``SPELL_CHECKER`` with its suggestion.
      4. Drop ``STOP_WORDS``, lemmatise with ``STEMMER`` and drop short tokens
         once more (correction or lemmatisation may have changed lengths).

    Args:
        text: Raw text as a ``str``.

    Returns:
        The cleaned text as one string with single spaces between tokens;
        an empty string when no token survives.
    """
    text = _ACCOUNT_TAG_RE.sub("", text)
    # Junk is replaced by a space (not removed) so neighbouring words stay apart.
    text = re.sub(custom_junk_words, " ", text)
    text = _URL_RE.sub("", text)
    text = _HTML_RE.sub("", text)
    text = _NON_ALPHA_RE.sub("", text)

    # Lowercase *before* spell checking: SpellChecker.unknown() reports its
    # findings lowercased, so capitalised typos would otherwise never match
    # and would silently be left uncorrected.
    tokens = _drop_short_tokens(text.lower().split())

    unknown_tokens = set(SPELL_CHECKER.unknown(tokens))
    if unknown_tokens:
        # correction() may return None when it has no suggestion; fall back to
        # the token itself so the final join cannot fail on a non-string.
        tokens = [
            (SPELL_CHECKER.correction(token) or token) if token in unknown_tokens else token
            for token in tokens
        ]

    tokens = [STEMMER.lemmatize(token) for token in tokens if token not in STOP_WORDS]
    return " ".join(_drop_short_tokens(tokens))

In [None]:
class PreprocessConcurrent:
    """Apply ``preprocess_text`` to a pandas Series using worker processes.

    The work is handed to a ``multiprocessing.Pool`` in tasks of ``chunk_size``
    rows. The pool collects results itself, which avoids three problems of a
    hand-rolled Process/Queue scheme: joining workers before their queue data
    has been consumed, losing the last row of each chunk when writing results
    back through a positional slice, and silently ignoring a crashed worker.

    NOTE(review): workers must be able to resolve ``preprocess_text`` and the
    globals it uses; inside a notebook this presumably requires a fork-based
    start method -- confirm on the target platform.
    """

    def __init__(self, chunk_size=100):
        """Store the number of rows handed to a worker per task."""
        self.chunk_size = chunk_size

    def preprocess_texts(self, text_series):
        """Clean every entry of ``text_series`` in place.

        Args:
            text_series: pandas Series of raw strings. It is updated in place
                with the output of ``preprocess_text`` for every row.

        Returns:
            The same ``text_series`` object, for convenience.

        Raises:
            Any exception raised by ``preprocess_text`` in a worker is re-raised
            here by the pool instead of being dropped.
        """
        # Imported locally: the notebook's import cell only provides Process/Queue.
        from multiprocessing import Pool

        total_rows = text_series.shape[0]
        if total_rows == 0:
            return text_series

        # cpu_count() may return None when the CPU count cannot be determined.
        n_workers = N_PROCESSES or 1
        chunk_size = max(1, int(self.chunk_size))
        # Report once per "round" of chunks, i.e. one chunk per worker.
        report_every = chunk_size * n_workers

        cleaned = []
        with Pool(processes=n_workers) as pool:
            # imap yields results in input order, so positions line up with
            # the rows of text_series.
            results = pool.imap(preprocess_text, text_series.tolist(), chunksize=chunk_size)
            for done, cleaned_text in enumerate(results, start=1):
                cleaned.append(cleaned_text)
                if done % report_every == 0 or done == total_rows:
                    print(f"Processed {done} of {total_rows} rows")

        # Single write-back on the full series: reusing the series' own index
        # means every row is covered, including the last row of each chunk.
        text_series.update(pd.Series(cleaned, index=text_series.index))
        return text_series

# Preparing dataset & EDA

In [None]:
# Read both competition splits, then split the label column off the training frame.
test_df = pd.read_csv("../input/nlp-getting-started/test.csv")
train_df = pd.read_csv("../input/nlp-getting-started/train.csv")
# pop() returns the column and removes it from train_df in one step.
train_labels = train_df.pop("target")
print(f"Training shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# Feature generation

In [None]:
start_time = time.time()
concurrency = PreprocessConcurrent()
print("Original:\n", train_df["text"][10:20])
# Work on an explicit copy and assign it back. Mutating the Series returned by
# `train_df.text` only reaches the frame when pandas happens to hand out a
# view, which is not guaranteed (and never the case under Copy-on-Write).
train_text = train_df["text"].copy()
concurrency.preprocess_texts(train_text)
train_df["text"] = train_text
print("Processed:\n", train_df["text"][10:20])
print("--- %s Minutes ---" % ((time.time() - start_time)/60))

In [None]:
# TF-IDF over unigrams, capped at the 2000 most frequent terms; terms that
# appear in more than 90% of the documents are ignored.
vectorizer = TfidfVectorizer(
    max_features=2000, strip_accents="ascii",
    ngram_range=(1,1),max_df=0.9
)
# toarray() returns a plain ndarray. todense() returns np.matrix, which recent
# scikit-learn releases refuse as estimator input.
X = vectorizer.fit_transform(train_df.text).toarray()

In [None]:
# Inspect the fitted vocabulary (term -> feature index per scikit-learn's docs).
# With max_features=2000 this prints a large dict.
vectorizer.vocabulary_

# Model Training

In [None]:
# Gaussian Naive Bayes: cross-validated F1 on the TF-IDF features.
gaussian_nb = GaussianNB()
scores = cross_val_score(gaussian_nb, X, train_labels, scoring="f1", n_jobs=-1)

# Explicit axes with title and labels so the figure can be read on its own.
fold_ids = range(1, len(scores) + 1)
fig, ax = plt.subplots()
ax.plot(fold_ids, scores, marker="o")
ax.set_xticks(list(fold_ids))
ax.set(
    title=f"GaussianNB cross-validation (mean F1 = {scores.mean():.3f})",
    xlabel="Fold",
    ylabel="F1 score",
)
plt.show()

# Testing

In [None]:
# NOTE: test_df.text is still unprocessed here; the following cell cleans it
# and recomputes X_test, so this value is superseded.
# toarray() instead of todense(): estimators need an ndarray, not np.matrix.
X_test = vectorizer.transform(test_df.text).toarray()

In [None]:
start_time = time.time()
print("Original:\n", test_df["text"][10:20])
# Explicit copy + assign-back: do not rely on in-place mutation of the Series
# returned by `test_df.text` propagating into the frame.
test_text = test_df["text"].copy()
concurrency.preprocess_texts(test_text)
test_df["text"] = test_text
print("Processed:\n", test_df["text"][10:20])
# toarray() returns an ndarray; todense() would return np.matrix, which
# recent scikit-learn releases reject.
X_test = vectorizer.transform(test_df["text"]).toarray()
print("--- %s Minutes ---" % ((time.time() - start_time)/60))

In [None]:
# Fit on the full training set, then predict the test set.
gaussian_nb.fit(X, train_labels)
# Read the template from the same input directory as train.csv / test.csv.
# NOTE(review): assumes sample_submission.csv ships alongside them -- confirm.
submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")
# Assumes the template rows are in the same order as test_df -- TODO confirm.
submission["target"] = gaussian_nb.predict(X_test)
submission.to_csv("result.csv", index=False)
submission.head()