In [1]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import os
#from contractions import contractions_dict
#import contractions
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from spacy.lang.en import STOP_WORDS as spacy_stopwords
import spacy
from tqdm import tqdm
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from collections import Counter

In [None]:
!python -m spacy download en_core_web_md

In [None]:
nlp = spacy.load("en_core_web_md",disable=["ner","parser"])

In [None]:
combined_stopwords = set(stopwords.words('english')).union(set(spacy_stopwords))

In [None]:
data = pd.read_csv("/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv",
                  encoding="latin-1",header=None,names=["sentiment","id","date","flag","username","text"])

In [None]:
data.head()

In [None]:
data.drop(labels=data.columns[1:5],axis=1,inplace=True)

In [None]:
data.head()

In [None]:
def normalize_tweet(tweet):

    return tweet.lower()

In [None]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:

    data["text"] = list(pool.map(normalize_tweet,data["text"]))

In [None]:
def fix_contractions(tweet):

    return contractions.fix(tweet)

In [None]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:

    data["text"] = list(pool.map(fix_contractions,data["text"]))

In [None]:
def remove_noisy_tokens(tweet):

    return re.sub(pattern=r'@[a-zA-Z0-9 ]+|#[a-zA-Z0-9 ]+|\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*|\W+|\d+|<("[^"]*"|\'[^\']*\'|[^\'">])*>|_+|[^\u0000-\u007f]+',
                 string=tweet,repl=" ")

In [None]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:

    data["text"] = list(pool.map(remove_noisy_tokens,data["text"]))

In [None]:
def remove_remaining_noisy_tokens(tweet):

    return re.sub(pattern=r'\b\w\b|[^\u0000-\u007f]+|_+|\W+',
                 string=tweet,repl=" ")

In [None]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:

    data["text"] = list(pool.map(remove_remaining_noisy_tokens,data["text"]))

In [None]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:

    data["text"] = list(pool.map(word_tokenize,data["text"]))

In [None]:
def is_stopword(token):

    return token not in combined_stopwords

In [None]:
def remove_stopwords(tokenized_tweet):

    return [token for token in tokenized_tweet if is_stopword(token)]

In [None]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:

    data["text"] = list(pool.map(remove_stopwords,data["text"]))

In [None]:
with open("stopwords_removed.pkl","wb") as file_handle:
    pickle.dump(data["text"],file_handle)

In [None]:
def lemmatize_tweet(tokenized_tweet):

    raw_tweet = " ".join(tokenized_tweet)
    doc = nlp(raw_tweet)
    lemmatized_tweet = list()

    for token in doc:
        lemmatized_tweet.append(token.lemma_)

    return lemmatized_tweet

In [None]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:

    data["text"] = list(tqdm(pool.map(lemmatize_tweet,data["text"])))

In [None]:
with open("lemmatized_tweets.pkl","wb") as file_handle:
    pickle.dump(data["text"],file_handle)

In [2]:
data = pd.DataFrame()

with open("lemmatized_tweets.pkl","rb") as file_handle:
    data["text"] = pickle.load(file_handle)

In [3]:
converted_raw_text = list(data["text"].apply(lambda x: " ".join(x)))

In [4]:
converted_raw_text = list(filter(lambda x: len(x) > 0,converted_raw_text))

In [5]:
len(converted_raw_text)

1408026

In [6]:
vocab = set()

for cleaned_tweet in converted_raw_text:
    vocab.update(set(cleaned_tweet.split(" ")))

In [7]:
len(vocab)

273488

In [8]:
vectorizer = TfidfVectorizer()
vectorized_text = vectorizer.fit_transform(converted_raw_text)

In [9]:
vectorized_text.shape

(1408026, 273469)

In [14]:
cumulative_tfs = Counter()
for cleaned_tweet in data["text"]:

    cumulative_tfs.update(cleaned_tweet)

In [17]:
most_frequent_tokens = cumulative_tfs.most_common(30000)
most_frequent_tokens = dict(most_frequent_tokens)
truncated_vocab = list(most_frequent_tokens.keys())

truncated_vocab2idx = dict(zip(truncated_vocab,range(len(truncated_vocab))))

In [19]:
vectorizer = TfidfVectorizer(vocabulary=truncated_vocab2idx)
vectorized_text = vectorizer.fit_transform(converted_raw_text)

