# Downloading packages

In [None]:
!pip3 install pyspellchecker
!nltk.download('wordnet')

# Imports

In [None]:
import re
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from spellchecker import SpellChecker
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

%matplotlib inline

# Global Variables

In [None]:
# Shared, module-level helpers used by the text-processing functions below.
# NOTE: despite its name, STEMMER holds a WordNet lemmatizer, not a stemmer.
STEMMER = WordNetLemmatizer()
# Spell checker constructed with the library's default settings.
SPELL_CHECKER = SpellChecker()
# English stop words stored as a set for fast membership tests.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be available
# locally — confirm it is downloaded before this cell runs.
STOP_WORDS = set(stopwords.words("english"))

# Utility Functions

In [None]:
# Patterns are compiled once at definition time instead of on every call.
_URL_RE = re.compile(r"https?://\S+|www\.\S+")
_HTML_RE = re.compile("<.*?>.*</?.*?>")
_NON_ALPHA_RE = re.compile(r"[^a-zA-Z\s']")
_MULTI_SPACE_RE = re.compile(r"\s{2,}")


def preprocess_text(text):
    """Clean a raw text string and spell-correct its words.

    Steps, in order:
      1. remove URLs (``http(s)://...`` and ``www....``);
      2. remove HTML tags together with the text between them;
      3. remove every character that is not an ASCII letter, whitespace,
         or an apostrophe;
      4. collapse runs of two or more whitespace characters into one space;
      5. replace words unknown to ``SPELL_CHECKER`` with its best correction.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text. When at least one word was unknown to the spell
        checker, the result is the words re-joined with single spaces;
        otherwise it is the text as it stands after step 4.
    """
    text = _URL_RE.sub("", text)
    # NOTE: the middle ``.*`` is greedy, so everything from the first tag to
    # the last tag on a line is removed, including text between two
    # separate elements.
    text = _HTML_RE.sub("", text)
    text = _NON_ALPHA_RE.sub("", text)
    text = _MULTI_SPACE_RE.sub(" ", text)

    words = text.split()
    # With the default (case-insensitive) spell checker, unknown() returns
    # lower-cased words, so lookups below must use the lower-cased form too;
    # otherwise capitalised misspellings are never corrected.
    misspelled = SPELL_CHECKER.unknown(words)
    if not misspelled:
        return text

    # Resolve each distinct misspelling only once; correction() is expensive.
    corrections = {word: SPELL_CHECKER.correction(word) for word in misspelled}

    # correction() may return None when it has no candidate; in that case
    # (and for correctly spelled words) the original word is kept, which also
    # prevents " ".join from failing on a None entry.
    return " ".join(corrections.get(word.lower()) or word for word in words)

In [None]:
def tokenize_text(text):
    """Turn *text* into a list of lemmatized, lower-cased tokens.

    The text is lower-cased and split on whitespace; tokens found in
    ``STOP_WORDS`` are discarded and the remaining ones are passed through
    ``STEMMER.lemmatize``. Token order is preserved.
    """
    lemmas = []
    for word in text.lower().split():
        if word in STOP_WORDS:
            continue
        lemmas.append(STEMMER.lemmatize(word))
    return lemmas

# Preparing dataset & EDA

In [None]:
# Read the train and test CSV files.
train_df = pd.read_csv("../input/nlp-getting-started/train.csv")
test_df = pd.read_csv("../input/nlp-getting-started/test.csv")

# Detach the "target" column from the training frame and keep it as the labels.
train_labels = train_df.pop("target")

# Report the (rows, columns) of both frames after the label column is removed.
print(f"Training shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# Feature generation

In [None]:
# TF-IDF features over text cleaned by preprocess_text and tokenized by
# tokenize_text.
# - token_pattern=None: a custom tokenizer is supplied, so the default token
#   pattern is never used; passing None avoids scikit-learn's
#   "token_pattern will not be used" warning.
# - strip_accents is deliberately not passed: scikit-learn skips its built-in
#   accent stripping and lower-casing whenever a custom preprocessor is given,
#   so the former strip_accents="ascii" had no effect.
# NOTE(review): min_df=0.2 keeps only terms that occur in at least 20% of the
# documents, which looks very restrictive — confirm this is intended.
vectorizer = TfidfVectorizer(
    preprocessor=preprocess_text,
    tokenizer=tokenize_text,
    token_pattern=None,
    max_df=0.7,
    min_df=0.2,
)
vectorizer.fit(train_df.text)

In [None]:
# Report how many distinct terms the fitted vectorizer kept. The count is
# taken over the vocabulary itself, not over the formatted message.
print(f"Vocabulary size is: {len(vectorizer.vocabulary_)}")