In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE

from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet 

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import gensim

from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.exceptions import NotFound

[nltk_data] Downloading package wordnet to /home/traffic/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/traffic/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [9]:
# authenticate with BigQuery API
credentials = service_account.Credentials.from_service_account_file(
    '../../datascience-abovezero-58d98dcf7f65.json')
client = bigquery.Client.from_service_account_json(
    '../../datascience-abovezero-58d98dcf7f65.json')
# Perform a query.
QUERY = ('SELECT * FROM `datascience-abovezero.ml_sandbox.chegg_influencers_comments`')
query_job = client.query(QUERY)  # API request
df =  query_job.result().to_dataframe() #transform to Pandas Dataframe

#df2 = pd.read_csv('analisis_comments_tiktok.csv')# Parameters tuning using Grid Search

# Bag Of Words

### Preprocessing 

In [10]:
#def spelling_correcter(text):
    
def lemmatize_stemming(text):
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
            
    return result

comments = df.comment.tolist()
comments_prep = [preprocess(str(comment)) for comment in comments]
comments_dict = gensim.corpora.Dictionary(comments_prep)
bow_corpus = [comments_dict.doc2bow(doc) for doc in comments_prep]

# TFIDF 

### Preprocessing

In [11]:
#docs_raw = df.comment.fillna('').tolist()

tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 0.01)
#dtm_tf = tf_vectorizer.fit_transform(docs_raw)
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
#dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)

### Training The Model 

In [12]:
def get_top_topics(model, vectorizer, topn=5):
    names = tf_vectorizer.get_feature_names()
    res = {}
    for i_cluster, cluster in enumerate(model.components_):
        res[i_cluster]=[]
        for i_feature in cluster.argsort()[:-topn - 1:-1]:
            res[i_cluster] = res[i_cluster] + [names[i_feature]]
    return res

In [13]:
#first get topics for all comments (Full Dataset)
docs_raw_full = df.comment.fillna('').tolist()
dtm_tf_full = tf_vectorizer.fit_transform(docs_raw_full)
# train model to find topics per influencer
lda_tf_full = LatentDirichletAllocation(n_components=4, random_state=0)
lda_tf_full.fit(dtm_tf_full)
# extract most important topics
topics_full =  get_top_topics(lda_tf_full, tf_vectorizer, topn=5)
print(topics_full)

X_test = tf_vectorizer.transform(df.loc[:,'comment'])

doc_topic_dist_unnormalized_full = np.matrix(lda_tf_full.transform(X_test))



{0: ['school', 'thank', 'quot', 'like', 'high'], 1: ['cancer', 'colon', 'youtube', 'amp', 'literally'], 2: ['love', 'cancer', 'simone', 'colon', 'year'], 3: ['video', 'life', 'just', 'like', 'time']}


In [61]:
#first get topics for all comments (Full Dataset)
docs_raw_full = df.comment.fillna('').tolist()
dtm_tf_full = tf_vectorizer.fit_transform(docs_raw_full)
# train model to find topics per influencer
lda_tf_full = LatentDirichletAllocation(n_components=4, random_state=0)
lda_tf_full.fit(dtm_tf_full)
# extract most important topics
topics_full =  get_top_topics(lda_tf_full, tf_vectorizer, topn=5)
print(topics_full)

#now let's what of those topics are talked about by what influencer
# and then go more granular and look at topics on influencer level.
topic_mapping = pd.DataFrame()
res_infl = pd.DataFrame({'influencer':[]})
for topic in topics_full:
    topic_mapping.loc[:,topic] = np.array([str(topics_full[topic])])
    res_infl[topic] = []
    
for topic in topics_full:
    res_infl['likesrel_'+str(topic)] = []
for topic in topics_full:
    res_infl['likestot_'+str(topic)] = []

for i_infl, infl in enumerate(df.influencer.unique()):
    dfi = df.loc[df.influencer==infl,:]
    #what categories of full topic model do infl comments belong to
    X_test = tf_vectorizer.transform(dfi.loc[:,'comment'])
    doc_topic_dist_unnormalized_full = np.matrix(lda_tf_full.transform(X_test))
    # get count of number topics are 'hit'
    res_ = pd.DataFrame({'topic':list(topics_full.keys())})
    display(res_.head())
    res__ = pd.DataFrame(doc_topic_dist_unnormalized_full.argmax(axis=1)).value_counts().rename_axis('topic').reset_index(name='counts')
    res_ = res_.merge(res__, on='topic', how='left').reset_index(drop=True).fillna(0)
    
    # calculate relative topic distribution for influencer
    counts_sum = res_.counts.sum()
    res_.loc[:,'counts'] = res_.counts.apply(lambda x:round(x/counts_sum*100,1))
    res_ = res_.sort_values('topic',ascending=True).reset_index(drop=True)
    display([infl] + list(res_.counts))
    
    l_likecount_tot = len(topics_full)*[0]
    #l_likecount_rel = len(topics_full)*[0]
    total_likecount = dfi.comment_likecount.sum()
    for comment, comment_orig, comment_likes in zip(doc_topic_dist_unnormalized_full, dfi.comment, dfi.comment_likecount):
        print(comment_orig)
        print(topics_full[comment[0].argmax()])
        l_likecount_tot[comment[0].argmax()]+=comment_likes
    l_likecount_rel = [v/total_likecount for v in l_likecount_tot]
    print(l_likecount_rel + l_likecount_tot)
    res_infl.loc[len(res_infl)] = [infl] + list(res_.counts) + l_likecount_rel + l_likecount_tot
 

{0: ['school', 'thank', 'quot', 'like', 'high'], 1: ['cancer', 'colon', 'youtube', 'amp', 'literally'], 2: ['love', 'cancer', 'simone', 'colon', 'year'], 3: ['video', 'life', 'just', 'like', 'time']}




Unnamed: 0,topic
0,0
1,1
2,2
3,3


['itssozer', 59.2, 8.2, 14.3, 18.4]

Ayyyy legend
['school', 'thank', 'quot', 'like', 'high']
Are you serious
['school', 'thank', 'quot', 'like', 'high']
Yes it's college but it's called university
['school', 'thank', 'quot', 'like', 'high']
Your such an amazing person 😁
['love', 'cancer', 'simone', 'colon', 'year']
if i got chegg as a gift i would fucking die @Antedragz
['school', 'thank', 'quot', 'like', 'high']
That’s a different thing in my country tho
['school', 'thank', 'quot', 'like', 'high']
：Ce gars est une legende 👏
['school', 'thank', 'quot', 'like', 'high']
Omg your so nice
['love', 'cancer', 'simone', 'colon', 'year']
What a gift
['school', 'thank', 'quot', 'like', 'high']
HEAVEN NOW
['school', 'thank', 'quot', 'like', 'high']
I love your page @itssozer 🥰😁
['love', 'cancer', 'simone', 'colon', 'year']
Why would anyone ever hurt you? Your the nicest person I know🥰😁
['school', 'thank', 'quot', 'like', 'high']
hii I have CRPS a very rare disease and my mum dose not have enough money to pay for my medical bills
[

Unnamed: 0,topic
0,0
1,1
2,2
3,3


['emilyballz', 73.7, 5.3, 15.8, 5.3]

Hi
['school', 'thank', 'quot', 'like', 'high']
it's just to much effort and I hate school
['school', 'thank', 'quot', 'like', 'high']
I’ve been following for a year! I’m making you a go now!
['love', 'cancer', 'simone', 'colon', 'year']
Hi
['school', 'thank', 'quot', 'like', 'high']
Emily are you single?
['school', 'thank', 'quot', 'like', 'high']
Ty for replying 😊
['school', 'thank', 'quot', 'like', 'high']
Ya
['school', 'thank', 'quot', 'like', 'high']
Hi,I am really interested in you I would like us to work something out ,am ready to give you 200bucks
['video', 'life', 'just', 'like', 'time']
Hi
['school', 'thank', 'quot', 'like', 'high']
Damn
['school', 'thank', 'quot', 'like', 'high']
Hi!
['school', 'thank', 'quot', 'like', 'high']
Quizlet is also mad helpfully too and free
['cancer', 'colon', 'youtube', 'amp', 'literally']
I'm to lazy to do homework
['school', 'thank', 'quot', 'like', 'high']
True but not the same as Chegg:) Chegg can help teach you where quizlets is for flash ca

In [66]:
#first get topics for all comments (Full Dataset)
docs_raw_full = df.comment.fillna('').tolist()
dtm_tf_full = tf_vectorizer.fit_transform(docs_raw_full)
# train model to find topics per influencer
lda_tf_full = LatentDirichletAllocation(n_components=4, random_state=0)
lda_tf_full.fit(dtm_tf_full)
# extract most important topics
topics_full =  get_top_topics(lda_tf_full, tf_vectorizer, topn=5)
print(topics_full)

#now let's what of those topics are talked about by what influencer
# and then go more granular and look at topics on influencer level.
topic_mapping = pd.DataFrame()
res_infl = pd.DataFrame({'influencer':[]})
for topic in topics_full:
    topic_mapping.loc[:,topic] = np.array([str(topics_full[topic])])
    res_infl[topic] = []
    
for topic in topics_full:
    res_infl['likesrel_'+str(topic)] = []
for topic in topics_full:
    res_infl['likestot_'+str(topic)] = []

for i_infl, infl in enumerate(df.influencer.unique()):
    dfi = df.loc[df.influencer==infl,:]
    #what categories of full topic model do infl comments belong to
    X_test = tf_vectorizer.transform(dfi.loc[:,'comment'])
    doc_topic_dist_unnormalized_full = np.matrix(lda_tf_full.transform(X_test))
    # get count of number topics are 'hit'
    res_ = pd.DataFrame({'topic':list(topics_full.keys())})
    #display(res_.head())
    res__ = pd.DataFrame(doc_topic_dist_unnormalized_full.argmax(axis=1)).value_counts().rename_axis('topic').reset_index(name='counts')
    res_ = res_.merge(res__, on='topic', how='left').reset_index(drop=True).fillna(0)
    
    # calculate relative topic distribution for influencer
    counts_sum = res_.counts.sum()
    res_.loc[:,'counts'] = res_.counts.apply(lambda x:round(x/counts_sum*100,1))
    res_ = res_.sort_values('topic',ascending=True).reset_index(drop=True)
    #display([infl] + list(res_.counts))
    
    l_likecount_tot = len(topics_full)*[0]
    #l_likecount_rel = len(topics_full)*[0]
    total_likecount = dfi.comment_likecount.sum()
    for comment, comment_orig, comment_likes in zip(doc_topic_dist_unnormalized_full, dfi.comment, dfi.comment_likecount):
        #print(comment_orig)
        #print(topics_full[comment[0].argmax()])
        l_likecount_tot[comment[0].argmax()]+=comment_likes
    l_likecount_rel = [v/total_likecount for v in l_likecount_tot]

    res_infl.loc[len(res_infl)] = [infl] + list(res_.counts) + l_likecount_rel + l_likecount_tot
        
        
    """  
    docs_raw = dfi.comment.fillna('').tolist()
    dtm_tf = tf_vectorizer.fit_transform(docs_raw)
    # train model to find topics per influencer
    lda_tf = LatentDirichletAllocation(n_components=4, random_state=0)
    lda_tf.fit(dtm_tf)d
    # extract most important topics
    topics =  get_top_topics(lda_tf, tf_vectorizer, topn=5)
    print(infl)
    print(topics)
    
    doc_topic_dist_unnormalized = np.matrix(lda_tf.transform(X_test))
    for comment, comment_orig in zip(doc_topic_dist_unnormalized, dfi.comment):
        print(comment_orig)
        print(topics[comment[0].argmax()])
    """
    #if i_infl>-1:
    #    break



{0: ['school', 'thank', 'quot', 'like', 'high'], 1: ['cancer', 'colon', 'youtube', 'amp', 'literally'], 2: ['love', 'cancer', 'simone', 'colon', 'year'], 3: ['video', 'life', 'just', 'like', 'time']}


  l_likecount_rel = [v/total_likecount for v in l_likecount_tot]
  l_likecount_rel = [v/total_likecount for v in l_likecount_tot]
  l_likecount_rel = [v/total_likecount for v in l_likecount_tot]
  l_likecount_rel = [v/total_likecount for v in l_likecount_tot]
  l_likecount_rel = [v/total_likecount for v in l_likecount_tot]
  l_likecount_rel = [v/total_likecount for v in l_likecount_tot]


In [68]:
res_infl.head()

Unnamed: 0,influencer,0,1,2,3,likesrel_0,likesrel_1,likesrel_2,likesrel_3,likestot_0,likestot_1,likestot_2,likestot_3
0,itssozer,59.2,8.2,14.3,18.4,0.722222,0.006944,0.243056,0.027778,104,1,35,4
1,emilyballz,73.7,5.3,15.8,5.3,0.789474,0.157895,0.0,0.052632,15,3,0,1
2,iamalilstitious,50.0,0.0,37.5,12.5,0.666667,0.0,0.333333,0.0,2,0,1,0
3,sarahbada_,73.7,5.3,15.8,5.3,0.913043,0.0,0.086957,0.0,21,0,2,0
4,sakshammagic,73.1,11.5,3.8,11.5,0.761905,0.142857,0.0,0.095238,16,3,0,2


In [70]:
def save_results(df, table_id="datascience-abovezero.ml_sandbox.chegg_influencers_comments_results"):
    # create BigQuery table if it doesn't already exist
    try: 
        client.get_table(table_id)  # Make an API request.
        print("Table {} already exists.".format(table_id))
    except NotFound:
        print(f"Table {table_id} created.")
        schema = [
            bigquery.SchemaField("influencer", "STRING", mode="REQUIRED"),
            bigquery.SchemaField('_'.join(['topic0'] + topics_full[0]), "FLOAT64", mode="REQUIRED"),
            bigquery.SchemaField('_'.join(['topic1'] + topics_full[1]), "FLOAT64", mode="REQUIRED"),
            bigquery.SchemaField('_'.join(['topic2'] + topics_full[2]), "FLOAT64", mode="REQUIRED"),
            bigquery.SchemaField('_'.join(['topic3'] + topics_full[3]), "FLOAT64", mode="REQUIRED"),
            bigquery.SchemaField('likesrel0', "FLOAT64", mode="REQUIRED"),
            bigquery.SchemaField('likesrel1', "FLOAT64", mode="REQUIRED"),
            bigquery.SchemaField('likesrel2', "FLOAT64", mode="REQUIRED"),
            bigquery.SchemaField('likesrel3', "FLOAT64", mode="REQUIRED"),
            bigquery.SchemaField('likestot0', "FLOAT64", mode="REQUIRED"),
            bigquery.SchemaField('likestot1', "FLOAT64", mode="REQUIRED"),
            bigquery.SchemaField('likestot2', "FLOAT64", mode="REQUIRED"),
            bigquery.SchemaField('likestot3', "FLOAT64", mode="REQUIRED"),
        ]
        table = bigquery.Table(table_id, schema=schema)
        table = client.create_table(table)  # Make an API request.
        #print(
        #    "Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id)
        #)
    df.columns = ['influencer', '_'.join(['topic']+topics_full[0]),'_'.join(['topic']+topics_full[1]),'_'.join(['topic']+topics_full[2]),'_'.join(['topic']+topics_full[3]),
                 'likesrel0','likesrel1','likesrel2','likesrel3','likestot0','likestot1','likestot2','likestot3']     
    df.to_gbq('.'.join(table_id.split('.')[1:]), project_id=table_id.split('.')[0], if_exists='replace',#'append',
          chunksize=10000, progress_bar=True, credentials=credentials)
    
    
save_results(res_infl)

Table datascience-abovezero.ml_sandbox.chegg_influencers_comments_results already exists.


  pandas_gbq.to_gbq(
100%|████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 9467.95it/s]


In [30]:
res_infl.head()

Unnamed: 0,influencer,topic_school_thank_quot_like_high,topic_cancer_colon_youtube_amp_literally,topic_love_cancer_simone_colon_year,topic_video_life_just_like_time
0,itssozer,59.2,8.2,14.3,18.4
1,emilyballz,73.7,5.3,15.8,5.3
2,iamalilstitious,50.0,0.0,37.5,12.5
3,sarahbada_,73.7,5.3,15.8,5.3
4,sakshammagic,73.1,11.5,3.8,11.5
