In [10]:
import re

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# standardize text

In [11]:
# Load the dataset from the working directory and rename its columns positionally;
# assigning three names requires the CSV to contain exactly three columns.
input_file = pd.read_csv('social_media_clean_text.csv')
input_file.columns=['text', 'choose_one', 'class_label']

def standardize_text(df, text_field):
    """Lowercase a text column and strip URLs from it.

    The column is rewritten in place on ``df`` and the same object is returned,
    so the caller's frame is modified.

    Args:
        df: DataFrame holding the column to clean.
        text_field: Name of the text column in ``df``.

    Returns:
        ``df`` itself, with ``df[text_field]`` lowercased and with every
        ``http...`` run of non-whitespace characters removed. Missing values
        stay missing.
    """
    # normalize by turning all letters into lowercase
    lowered = df[text_field].str.lower()
    # get rid of URLs; the vectorized .str accessor passes missing values through,
    # whereas calling re.sub on each element raises TypeError on a NaN entry
    df[text_field] = lowered.str.replace(r"http\S+", "", regex=True)
    return df

# standardize_text rewrites the column on the frame it is given and returns that same
# frame, so clean_questions and input_file refer to the same DataFrame from here on.
clean_questions = standardize_text(input_file, "text")
clean_questions.head()

Unnamed: 0,text,choose_one,class_label
0,just happened a terrible car crash,Relevant,1
1,our deeds are the reason of this earthquake m...,Relevant,1
2,"heard about earthquake is different cities, s...",Relevant,1
3,"there is a forest fire at spot pond, geese are...",Relevant,1
4,forest fire near la ronge sask canada,Relevant,1


# tokenize

In [12]:
# Tokenize each text into runs of word characters (\w+), so punctuation is not kept
# as part of any token.
tokenizer = RegexpTokenizer(r'\w+')
clean_questions["tokens"] = clean_questions["text"].apply(tokenizer.tokenize)
# Note: a plain whitespace split would also be possible here:
#     clean_questions['tokens'] = [i.split() for i in clean_questions.text]
# but it produces errors around commas, quotation marks and similar punctuation.
# On this data the simple split and the tokenizer used above give the same result
# for roughly 1/2 of the rows, and the simple split has problems on roughly 1/5.
clean_questions.head()

Unnamed: 0,text,choose_one,class_label,tokens
0,just happened a terrible car crash,Relevant,1,"[just, happened, a, terrible, car, crash]"
1,our deeds are the reason of this earthquake m...,Relevant,1,"[our, deeds, are, the, reason, of, this, earth..."
2,"heard about earthquake is different cities, s...",Relevant,1,"[heard, about, earthquake, is, different, citi..."
3,"there is a forest fire at spot pond, geese are...",Relevant,1,"[there, is, a, forest, fire, at, spot, pond, g..."
4,forest fire near la ronge sask canada,Relevant,1,"[forest, fire, near, la, ronge, sask, canada]"


In [13]:
## [EDA] Explore words and sentences
# Number of tokens in each row.
sentence_lengths = list(map(len, clean_questions["tokens"]))
# Every token in the data, flattened in row order.
all_tokens = [word for sentence in clean_questions["tokens"] for word in sentence]
# Distinct tokens, sorted.
VOCAB = sorted(set(all_tokens))

print("%s words total, with a vocabulary size of %s" % (len(all_tokens), len(VOCAB)))

132260 words total, with a vocabulary size of 17070


# fit data in CountVectorizer

In [14]:
# Features are the cleaned text column; targets are the class labels.
list_corpus = clean_questions["text"]
list_labels = clean_questions["class_label"]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    list_corpus, list_labels, test_size=0.2, random_state=40
)

count_vectorizer = CountVectorizer(analyzer='word', token_pattern=r'\w+')

# The data has 17070 unique words in total, but once it is fitted in count_vectorizer
# 2054 of them are left out; many of the dropped values are typos or plurals.
# NOTE(review): the vectorizer is fitted on X_train only, so words that occur only in
# the test split are also missing from its vocabulary, which likely explains part of
# that gap.
bow = {
    "train": (count_vectorizer.fit_transform(X_train), y_train),
    "test": (count_vectorizer.transform(X_test), y_test),
}

print(bow["train"][0].shape)
print(bow["test"][0].shape)

(7425, 15016)
(1857, 15016)


# fit data in TfidfVectorizer

In [15]:
# TfidfVectorizer is imported in the notebook's top import cell with the other
# dependencies, so this cell no longer carries its own import.
tfidf_vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w+')

# Fit on the training split only; the test split is transformed with the fitted
# vocabulary and IDF weights. Each entry pairs the feature matrix with its labels.
tfidf = dict()
tfidf["train"] = (tfidf_vectorizer.fit_transform(X_train), y_train)
tfidf["test"]  = (tfidf_vectorizer.transform(X_test), y_test)

print(tfidf["train"][0].shape)
print(tfidf["test"][0].shape)

(7425, 15016)
(1857, 15016)


In [16]:
# <tfidf> has more distinct values than <bow>. Wherever <tfidf> is <0>, <bow> is <0> too.
# Any other <tfidf> value is derived from how often the word occurs in this row and
# across the whole corpus, whereas the <bow> value is simply the number of times that
# particular word occurs in this particular observation.

# Densify only the row being inspected: calling .toarray() on the full training
# matrices would allocate two dense 7425 x 15016 arrays just to read a single row.
sample_row = 154
tfidf_row = tfidf['train'][0][sample_row].toarray()[0]
bow_row = bow['train'][0][sample_row].toarray()[0]
print('tfidf: ')
print(set(list(tfidf_row)))
print('\nbow: ')
print(set(list(bow_row)))

tfidf: 
{0.0, 0.1644577804300159, 0.15244080441349195, 0.40155831974942013, 0.22780705078113467, 0.27598493701225013, 0.17903638074793415, 0.3696967282268077, 0.23848397390927262, 0.2653465371014585, 0.08237693822208582, 0.10890150825569447, 0.23551689351944025, 0.19030608767815718, 0.12870725734996555, 0.1276342175979817, 0.15443718428522887, 0.1920674547330686, 0.18864262620124475, 0.22001515993286655, 0.13998320781004508}

bow: 
{0, 1, 2, 3, 5}


In [18]:
# Relative path, so the vectors file must already be present in the working directory.
# NOTE(review): presumably the pretrained 300-dimensional Google News vectors, which
# would match the k=300 default used when averaging — confirm.
word2vec_path = "GoogleNews-vectors-negative300.bin"
# Load the file as gensim KeyedVectors; binary=True selects the binary word2vec format.
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [19]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Average the word vectors of a token list into one length-``k`` vector.

    Args:
        tokens_list: Sequence of tokens to embed.
        vector: Mapping-like lookup supporting ``word in vector`` and ``vector[word]``.
        generate_missing: When true, a token absent from ``vector`` gets a fresh
            ``np.random.rand(k)`` vector; otherwise it gets ``np.zeros(k)``.
        k: Length of the placeholder vectors and of the all-zero result returned
            for an empty token list.

    Returns:
        The element-wise mean over all tokens (placeholders included), or
        ``np.zeros(k)`` when ``tokens_list`` is empty.
    """
    if len(tokens_list) < 1:
        return np.zeros(k)

    # Both factories take the vector length; a new placeholder is built per missing token.
    make_placeholder = np.random.rand if generate_missing else np.zeros

    word_vectors = []
    for token in tokens_list:
        if token in vector:
            word_vectors.append(vector[token])
        else:
            word_vectors.append(make_placeholder(k))

    # Missing tokens count towards the denominator, so they pull the mean
    # towards their placeholder values.
    total = np.sum(word_vectors, axis=0)
    return np.divide(total, len(word_vectors))

def get_word2vec_embeddings(vectors, clean_questions_tokens, generate_missing=False):
    """Compute one averaged word2vec embedding per entry of a token-list Series.

    Args:
        vectors: Word-vector lookup handed to ``get_average_word2vec``.
        clean_questions_tokens: Series whose elements are token lists.
        generate_missing: Forwarded to ``get_average_word2vec``.

    Returns:
        A list with one averaged vector per entry, in the Series' order.
    """
    def _embed(tokens):
        return get_average_word2vec(tokens, vectors, generate_missing=generate_missing)

    per_row = clean_questions_tokens.apply(_embed)
    return list(per_row)

# Build one averaged embedding per row of tokens. generate_missing is left at its
# default (False), so words missing from the word2vec lookup contribute zero vectors.
embeddings = get_word2vec_embeddings(word2vec, clean_questions['tokens'])