In [1]:
import nltk

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import os
import warnings
import pandas as pd
import numpy as np

In [4]:
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

In [5]:
feed_hashtag = pd.read_csv(os.getcwd() + '/Datasets/feed_hashtag_long.csv')
feed_hashtag['createdAt'] = pd.to_datetime(feed_hashtag['createdAt'])
print(feed_hashtag.shape)
feed_hashtag.head(1)

(10776, 11)


Unnamed: 0,contentId,authorId_content,createdAt,tagIds,text,type,hashtagId,authorId_hashtag,hashtag,isActive,isPrimary
0,5df457077e4ea600161b8e53,5df4561e7e4ea600161b8e3e,2019-12-14 03:29:11.865,5df39410b2694d0016bdb724,"Hello, my name is Kalyani. I have done my bach...",QUESTION,5df39410b2694d0016bdb724,5df38de0b2694d0016bdb71f,inspiration,True,True


In [6]:
feed_hashtag = feed_hashtag[feed_hashtag['isPrimary'] == True]
feed_hashtag = feed_hashtag[~feed_hashtag['text'].isna()]
print("Data Size:", feed_hashtag.shape[0])
print("Unique Content:", feed_hashtag['contentId'].nunique())
print("Unique tags:", feed_hashtag['hashtag'].nunique())

Data Size: 2724
Unique Content: 1708
Unique tags: 11


In [7]:
feed_hashtag['createDate'] = feed_hashtag['createdAt'].dt.date
feed_hashtag.head(1)

Unnamed: 0,contentId,authorId_content,createdAt,tagIds,text,type,hashtagId,authorId_hashtag,hashtag,isActive,isPrimary,createDate
0,5df457077e4ea600161b8e53,5df4561e7e4ea600161b8e3e,2019-12-14 03:29:11.865,5df39410b2694d0016bdb724,"Hello, my name is Kalyani. I have done my bach...",QUESTION,5df39410b2694d0016bdb724,5df38de0b2694d0016bdb71f,inspiration,True,True,2019-12-14


In [8]:
feed_hashtag.groupby(['createDate']).agg({'contentId':'nunique', 'tagIds':'count'}).head()

Unnamed: 0_level_0,contentId,tagIds
createDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-12-12,261,262
2019-12-14,125,133
2019-12-15,2,2
2019-12-16,15,24
2019-12-17,4,8


In [9]:
feed_hashtag = feed_hashtag.sort_values(['contentId'])

In [10]:
feed_hashtag.groupby(['contentId'])['hashtag'].count().reset_index().sort_values(['hashtag'], ascending = False).head()

Unnamed: 0,contentId,hashtag
794,5e7cbbbbad47950024ffd35c,4
1322,5e92bfa9f04afc002e0974f4,4
1156,5e88746bbac5dc004dc55f96,4
861,5e7e59ec8735d5002991dd78,4
1148,5e883e2239bde20029d8280a,4


In [12]:
feed_hashtag_rollup = feed_hashtag.groupby(['contentId', 'authorId_content', 'createdAt', 'text', 'type', 'isActive', 'isPrimary'])['hashtag'].agg([('hashtag_count','count'), ('hashtag_list', ','.join)]).reset_index()
print(feed_hashtag_rollup.shape)
feed_hashtag_rollup.sort_values(['hashtag_count'],ascending=False).head(2)

(1708, 9)


Unnamed: 0,contentId,authorId_content,createdAt,text,type,isActive,isPrimary,hashtag_count,hashtag_list
794,5e7cbbbbad47950024ffd35c,5e7cb9c8ad47950024ffd357,2020-03-26 14:27:07.376,Hi KoolKanyas #careeradvice #career #engineer...,QUESTION,True,True,4,"career-switch,inspiration,starting-out,career-..."
1322,5e92bfa9f04afc002e0974f4,5e92b67680326b00285f4f42,2020-04-12 07:13:45.175,I am Janhavi . I have my own Event design and ...,POST,True,True,4,"career-growth,marketingQueries,selfcare,networ..."


In [13]:
train = feed_hashtag[feed_hashtag['createdAt'] <= pd.to_datetime('2020-04-16')].copy()
print("train data:", train.shape[0])
test = feed_hashtag[feed_hashtag['createdAt'] > pd.to_datetime('2020-04-16')].copy()
print("test data:", test.shape[0])

train data: 2214
test data: 510


In [14]:
feed_token = [word_tokenize(sent.lower()) for sent in train.text]

In [15]:
stop_punct = list(punctuation)
stop_nltk = stopwords.words("english")
stop_updated = stop_nltk + stop_punct

In [16]:
def del_stop(sent):
    return [term for term in sent if term not in stop_updated]

In [17]:
feed_token_clean = [del_stop(sent) for sent in feed_token]

In [18]:
train['text_clean'] = [" ".join(sent) for sent in feed_token_clean]
train.head(1)

Unnamed: 0,contentId,authorId_content,createdAt,tagIds,text,type,hashtagId,authorId_hashtag,hashtag,isActive,isPrimary,createDate,text_clean
41,5df454f37e4ea600161b8e2e,5df454297e4ea600161b8e25,2019-12-14 03:20:19.029,5df39412b2694d0016bdb72b,Hi I am Shreshtha. I have been baking for the ...,QUESTION,5df39412b2694d0016bdb72b,5df38de0b2694d0016bdb71f,career-growth,True,True,2019-12-14,hi shreshtha baking last 5 years brand associa...


In [19]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train['text_clean'], train['hashtag'])

In [20]:
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y_enc = encoder.fit_transform(train_y)
valid_y_enc = encoder.fit_transform(valid_y)

In [21]:
mapping = pd.DataFrame({'hashtag':train_y, 'hashtag_encoding':train_y_enc})
mapping = mapping.drop_duplicates()
mapping.sort_values(['hashtag_encoding'])

Unnamed: 0,hashtag,hashtag_encoding
1872,Finance,0
8007,all-things-legal,1
10469,career-growth,2
9209,career-switch,3
10387,inspiration,4
4261,marketingQueries,5
5206,motherhood,6
10633,networking,7
8771,selfcare,8
5132,speaking-out,9


# Count Vectorizer as Features

In [22]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(train['text_clean'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
        vocabulary=None)

In [23]:
# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

# TFIDF Vectors as Features

In [24]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train['text_clean'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# Model Building

In [25]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

In [26]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print("NB, Count Vectors: ", accuracy)

NB, Count Vectors:  0.2075812274368231


In [27]:
# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy)

NB, WordLevel TF-IDF:  0.2292418772563177


In [28]:
classifier = naive_bayes.MultinomialNB()
classifier.fit(xtrain_tfidf, train_y_enc)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
#predictions = classifier.predict(xvalid_count)

In [29]:
prob_predictions = classifier.predict_proba(xvalid_tfidf)
prob_predictions[0]

array([0.0069119 , 0.00314012, 0.0879011 , 0.03007183, 0.27388804,
       0.07043501, 0.00645206, 0.04766004, 0.0565681 , 0.29924179,
       0.11773002])

In [34]:
prob_prediction_df = pd.DataFrame(prob_predictions, index = valid_y.index)
print(prob_prediction_df.shape)
prob_prediction_df['index_col'] = prob_prediction_df.index
prob_prediction_df.head()

(554, 11)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,index_col
10482,0.006912,0.00314,0.087901,0.030072,0.273888,0.070435,0.006452,0.04766,0.056568,0.299242,0.11773,10482
6451,0.003525,0.001647,0.026129,0.010634,0.850486,0.022937,0.003432,0.014145,0.01624,0.024009,0.026815,6451
5311,0.00267,0.001045,0.171724,0.029948,0.077921,0.075104,0.002234,0.098589,0.017309,0.044947,0.478508,5311
7643,0.001595,0.00059,0.132083,0.015577,0.274687,0.108526,0.00154,0.028894,0.047549,0.122294,0.266664,7643
8340,0.005704,0.001734,0.230743,0.029436,0.191992,0.065051,0.003556,0.041281,0.032372,0.101076,0.297056,8340


In [None]:
prob_prediction_df_long = pd.melt(prob_prediction_df, id_vars='index_col', value_vars=range(11))
prob_prediction_df_long.columns = ['index_col', 'hashtag_encoding', 'predicted_prob']
prob_prediction_df_long['hashtag_rank'] = prob_prediction_df_long.groupby(['index_col'])['predicted_prob'].rank(ascending=False)
prob_prediction_df_long = prob_prediction_df_long.sort_values(['index_col', 'hashtag_rank'])
prob_prediction_df_long = pd.merge(prob_prediction_df_long, mapping, how = 'left', on = 'hashtag_encoding')
prob_prediction_df_long['pred_rank'] = 'pred_rank_' + prob_prediction_df_long['hashtag_rank'].astype(int).map(str)
prob_prediction_df_long = prob_prediction_df_long.sort_values(['index_col', 'hashtag_rank'])
prob_prediction_df_long.head(11)

In [None]:
prob_prediction_df_wide = prob_prediction_df_long.pivot_table(index = ['index_col'], columns = 'pred_rank', values = ['hashtag'], aggfunc=lambda x: ''.join(x))
prob_prediction_df_wide = prob_prediction_df_wide.reset_index()
prob_prediction_df_wide.columns = ['%s%s' % (a, '_%s' % b if b else '') for a, b in prob_prediction_df_wide.columns]
prob_prediction_df_wide.index = prob_prediction_df_wide.index_col
print(prob_prediction_df_wide.shape)
prob_prediction_df_wide.head()


In [None]:
validation = train[train.index.isin(valid_y.index)]
print(validation.shape)
validation_rollup = validation.groupby(['contentId', 'authorId_content', 'createdAt', 'text', 'type', 'isActive', 'isPrimary'])['hashtag'].agg([('hashtag_count','count'), ('hashtag_list', ','.join)]).reset_index()
print(validation_rollup.shape)
#validation_rollup.sort_values(['hashtag_count'],ascending=False).head()

In [None]:
prediction_results = pd.merge(validation_rollup[['text', 'hashtag_list']], prob_prediction_df_wide, how = 'right', left_index=True, right_index=True, sort=False, copy=True)
prediction_results = prediction_results.drop(['index_col'], axis=1)
print(prediction_results.shape)
prediction_results