In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE

from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet 

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import gensim

from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.exceptions import NotFound

[nltk_data] Downloading package wordnet to /home/traffic/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/traffic/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [32]:
# Authenticate against BigQuery with the service-account key file.
# The same key backs both objects: `credentials` is handed to the pandas
# uploader further down, `client` talks to the BigQuery API directly.
SERVICE_ACCOUNT_KEY = '../../datascience-abovezero-58d98dcf7f65.json'
credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_KEY)
client = bigquery.Client.from_service_account_json(SERVICE_ACCOUNT_KEY)

# Load every row of the influencer-comments table into a pandas DataFrame.
QUERY = ('SELECT * FROM `datascience-abovezero.ml_sandbox.chegg_influencers_comments`')
query_job = client.query(QUERY)  # API request
query_rows = query_job.result()  # blocks until the query has finished
df = query_rows.to_dataframe()

#df2 = pd.read_csv('analisis_comments_tiktok.csv')

# Parameters tuning using Grid Search

# Bag Of Words

### Preprocessing 

In [16]:
#def spelling_correcter(text):
    
def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with the WordNet
    lemmatizer; the resulting lemma is then stemmed with the Porter stemmer.

    Parameters
    ----------
    text : str
        One token (word) to normalize.

    Returns
    -------
    str
        The Porter stem of the verb lemma of ``text``.
    """
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    # Reuse the lemmatizer created above instead of constructing a second,
    # throw-away instance for every token.
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))
# Tokenize and lemmatize
def preprocess(text):
    """Turn raw text into a list of normalized tokens.

    The text is tokenized with gensim's ``simple_preprocess``. Tokens that are
    gensim stop words, or that are three characters long or shorter, are
    dropped; every remaining token is passed through ``lemmatize_stemming``.

    Returns the normalized tokens in their original order.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stop_words
    ]

# Raw comment values, one entry per row of `df`.
comments = df.comment.tolist()
# Every entry is coerced to str before tokenizing, so non-string values
# (e.g. missing comments) do not break the tokenizer.
comments_prep = list(map(preprocess, map(str, comments)))
# Token <-> integer-id dictionary built over all preprocessed comments.
comments_dict = gensim.corpora.Dictionary(comments_prep)
# Bag-of-words representation of each comment.
bow_corpus = list(map(comments_dict.doc2bow, comments_prep))

# TFIDF 

### Preprocessing

In [5]:
# Settings shared by the raw-count and the TF-IDF vectorizer.
VECTORIZER_PARAMS = dict(
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    token_pattern=r'\b[a-zA-Z]{3,}\b',  # purely alphabetic tokens of 3+ letters
    max_df=0.5,    # drop terms that occur in more than 50% of the documents
    min_df=0.01,   # drop terms that occur in fewer than 1% of the documents
)

tf_vectorizer = CountVectorizer(**VECTORIZER_PARAMS)
# Built from the explicit settings rather than from tf_vectorizer.get_params():
# get_params() also returns CountVectorizer's integer `dtype`, which
# TfidfVectorizer does not support (it warns and converts to float64 at fit time).
tfidf_vectorizer = TfidfVectorizer(**VECTORIZER_PARAMS)

### Training The Model 

In [6]:
def get_top_topics(model, vectorizer, topn=5):
    """Return the ``topn`` highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : fitted topic model
        Must expose ``components_`` with one row of term weights per topic
        (e.g. a fitted ``LatentDirichletAllocation``).
    vectorizer : fitted vectorizer
        The vectorizer whose feature names index the columns of
        ``model.components_``.
    topn : int, default 5
        Number of terms to return per topic.

    Returns
    -------
    dict[int, list[str]]
        Topic index -> terms, ordered from highest to lowest weight.
    """
    # Read the vocabulary from the vectorizer that was passed in, not from a
    # notebook-level global. `get_feature_names` was removed in scikit-learn
    # 1.2, so prefer `get_feature_names_out` and only fall back to the legacy
    # method on old versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        raw_names = vectorizer.get_feature_names_out()
    else:
        raw_names = vectorizer.get_feature_names()
    # Plain Python strings keep the printed result identical across versions.
    names = [str(name) for name in raw_names]

    res = {}
    for i_cluster, cluster in enumerate(model.components_):
        # Indices of the `topn` largest weights, largest first.
        top_features = np.argsort(cluster)[::-1][:topn]
        res[i_cluster] = [names[i_feature] for i_feature in top_features]
    return res

In [7]:
# Topic model over all comments (full dataset).
# Missing comments become empty strings so the vectorizer only sees text.
docs_raw_full = df.comment.fillna('').tolist()
dtm_tf_full = tf_vectorizer.fit_transform(docs_raw_full)

# Fit a 4-topic LDA model on the term counts; the seed is fixed so the
# topics are the same on every run.
lda_tf_full = LatentDirichletAllocation(n_components=4, random_state=0)
lda_tf_full.fit(dtm_tf_full)

# Five highest-weighted terms per topic.
topics_full = get_top_topics(lda_tf_full, tf_vectorizer, topn=5)
print(topics_full)

# Per-comment topic distribution. The filled documents are transformed
# (rather than the raw `df.comment` column) so that a missing comment cannot
# make the vectorizer raise.
X_test = tf_vectorizer.transform(docs_raw_full)
doc_topic_dist_unnormalized_full = np.matrix(lda_tf_full.transform(X_test))



{0: ['just', 'people', 'good', 'college', 'chegg'], 1: ['cancer', 'school', 'colon', 'high', 'year'], 2: ['love', 'video', 'thank', 'simone', 'youtube'], 3: ['quot', 'don', 'time', 'like', 'cancer']}


In [8]:
# Which of the full-dataset topics are talked about under which influencer?
# This cell reuses `tf_vectorizer`, `lda_tf_full` and `topics_full` fitted in
# the previous cell instead of re-fitting the identical model a second time.
# TODO: go more granular and fit a separate topic model per influencer.

# One-row lookup table: topic id -> its top terms rendered as a string.
topic_mapping = pd.DataFrame(
    {topic: [str(terms)] for topic, terms in topics_full.items()}
)

topic_ids = list(topics_full)  # 0 .. n_topics-1, as produced by get_top_topics
n_topics = len(topic_ids)

influencer_rows = []
# groupby(sort=False) visits influencers in order of first appearance (the
# same order as `.unique()`) and skips rows whose influencer is missing,
# which would otherwise produce an empty slice that the model cannot transform.
for infl, dfi in df.groupby('influencer', sort=False):
    # Missing comments become empty strings so the vectorizer only sees text.
    X_test = tf_vectorizer.transform(dfi.comment.fillna(''))
    doc_topic_dist = lda_tf_full.transform(X_test)

    # Each comment is assigned to its highest-scoring topic; count how many
    # comments land in each topic (topics without comments get 0).
    dominant_topic = doc_topic_dist.argmax(axis=1)
    counts = np.bincount(dominant_topic, minlength=n_topics)

    # Relative topic distribution for this influencer, in percent.
    total = float(counts.sum())
    shares = [round(float(count) / total * 100, 1) for count in counts]

    row = [infl] + shares
    display(row)
    influencer_rows.append(row)

# Build the result frame once instead of growing it row by row.
res_infl = pd.DataFrame(influencer_rows, columns=['influencer'] + topic_ids)

{0: ['just', 'people', 'good', 'college', 'chegg'], 1: ['cancer', 'school', 'colon', 'high', 'year'], 2: ['love', 'video', 'thank', 'simone', 'youtube'], 3: ['quot', 'don', 'time', 'like', 'cancer']}


Unnamed: 0,topic,counts
0,0,38
1,1,5
2,2,4
3,3,2


['itssozer', 77.6, 10.2, 8.2, 4.1]

Unnamed: 0,topic,counts
0,0,15.0
1,1,2.0
2,2,2.0
3,3,0.0


['emilyballz', 78.9, 10.5, 10.5, 0.0]

Unnamed: 0,topic,counts
0,0,3.0
1,1,0.0
2,2,1.0
3,3,1.0


['iamalilstitious', 60.0, 0.0, 20.0, 20.0]

Unnamed: 0,topic,counts
0,0,15
1,1,2
2,2,1
3,3,1


['sarahbada_', 78.9, 10.5, 5.3, 5.3]

Unnamed: 0,topic,counts
0,0,21.0
1,1,0.0
2,2,2.0
3,3,3.0


['sakshammagic', 80.8, 0.0, 7.7, 11.5]

Unnamed: 0,topic,counts
0,0,54
1,1,8
2,2,6
3,3,7


['cloutom', 72.0, 10.7, 8.0, 9.3]

Unnamed: 0,topic,counts
0,0,3.0
1,1,0.0
2,2,0.0
3,3,2.0


['medstudebt', 60.0, 0.0, 0.0, 40.0]

Unnamed: 0,topic,counts
0,0,9.0
1,1,0.0
2,2,1.0
3,3,0.0


['j0siahyoung', 90.0, 0.0, 10.0, 0.0]

Unnamed: 0,topic,counts
0,0,42
1,1,4
2,2,16
3,3,3


['modern.day.classic', 64.6, 6.2, 24.6, 4.6]

Unnamed: 0,topic,counts
0,0,32
1,1,2
2,2,4
3,3,4


['sarahrav', 76.2, 4.8, 9.5, 9.5]

Unnamed: 0,topic,counts
0,0,65
1,1,3
2,2,8
3,3,24


['ryanmarksutherland', 65.0, 3.0, 8.0, 24.0]

Unnamed: 0,topic,counts
0,0,115
1,1,13
2,2,6
3,3,6


['themccartys', 82.1, 9.3, 4.3, 4.3]

Unnamed: 0,topic,counts
0,0,23
1,1,3
2,2,1
3,3,5


['espdaniella', 71.9, 9.4, 3.1, 15.6]

Unnamed: 0,topic,counts
0,0,75
1,1,3
2,2,7
3,3,9


['thatrelatablestudent', 79.8, 3.2, 7.4, 9.6]

Unnamed: 0,topic,counts
0,0,42
1,1,1
2,2,19
3,3,11


['niki_patton', 57.5, 1.4, 26.0, 15.1]

Unnamed: 0,topic,counts
0,0,34
1,1,5
2,2,4
3,3,3


['thelawerangela', 73.9, 10.9, 8.7, 6.5]

Unnamed: 0,topic,counts
0,0,2.0
1,1,0.0
2,2,0.0
3,3,1.0


['lifeincollege', 66.7, 0.0, 0.0, 33.3]

Unnamed: 0,topic,counts
0,0,35
1,1,4
2,2,6
3,3,4


['rrogersworld', 71.4, 8.2, 12.2, 8.2]

Unnamed: 0,topic,counts
0,0,99
1,1,6
2,2,14
3,3,16


['lexistrechak', 73.3, 4.4, 10.4, 11.9]

Unnamed: 0,topic,counts
0,0,8.0
1,1,0.0
2,2,0.0
3,3,1.0


['maddyspencer5', 88.9, 0.0, 0.0, 11.1]

Unnamed: 0,topic,counts
0,0,4.0
1,1,0.0
2,2,0.0
3,3,0.0


['rollewitit_math', 100.0, 0.0, 0.0, 0.0]

Unnamed: 0,topic,counts
0,0,10.0
1,1,1.0
2,2,0.0
3,3,0.0


['fernsulantay', 90.9, 9.1, 0.0, 0.0]

Unnamed: 0,topic,counts
0,0,123
1,1,12
2,2,72
3,3,44


['sydneyserena', 49.0, 4.8, 28.7, 17.5]

Unnamed: 0,topic,counts
0,0,186
1,1,264
2,2,79
3,3,293


['JJ Medicine', 22.6, 32.1, 9.6, 35.6]

Unnamed: 0,topic,counts
0,0,54
1,1,15
2,2,51
3,3,10


['ninjanerdscience', 41.5, 11.5, 39.2, 7.7]

Unnamed: 0,topic,counts
0,0,1.0
1,1,1.0
2,2,0.0
3,3,0.0


['pacollective', 50.0, 50.0, 0.0, 0.0]

Unnamed: 0,topic,counts
0,0,54
1,1,5
2,2,35
3,3,4


['NinjaNerdScience', 55.1, 5.1, 35.7, 4.1]

Unnamed: 0,topic,counts
0,0,356
1,1,182
2,2,280
3,3,86


['Sydney Serena', 39.4, 20.1, 31.0, 9.5]

Unnamed: 0,topic,counts
0,0,37
1,1,2
2,2,3
3,3,4


['markiedoesmath', 80.4, 4.3, 6.5, 8.7]

Unnamed: 0,topic,counts
0,0,7
1,1,2
2,2,5
3,3,1


['thelawyerangela', 46.7, 13.3, 33.3, 6.7]

Unnamed: 0,topic,counts
0,0,72
1,1,2
2,2,10
3,3,9


['campuscrawl', 77.4, 2.2, 10.8, 9.7]

Unnamed: 0,topic,counts
0,0,29
1,1,4
2,2,3
3,3,4


['mathicallytutors', 72.5, 10.0, 7.5, 10.0]

Unnamed: 0,topic,counts
0,0,13.0
1,1,0.0
2,2,2.0
3,3,0.0


['leonthetutor', 86.7, 0.0, 13.3, 0.0]

In [38]:
def save_results(df, table_id="datascience-abovezero.ml_sandbox.chegg_influencers_comments_results",
                 if_exists='append'):
    """Upload per-influencer topic shares to a BigQuery table.

    The first column of ``df`` is uploaded as ``influencer``; every following
    column is uploaded as ``topic_0``, ``topic_1``, ... in order. The table is
    created first if it does not exist yet. ``df`` itself is not modified.

    Uses the notebook-level ``client`` (BigQuery client) and ``credentials``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per influencer: the influencer name followed by one numeric
        column per topic.
    table_id : str
        Fully qualified destination in the form ``project.dataset.table``.
    if_exists : str, default 'append'
        Passed through to ``DataFrame.to_gbq``. With the default, calling
        this function again adds the same rows a second time.

    Raises
    ------
    ValueError
        If ``df`` has no topic column after the influencer column.
    """
    n_topics = len(df.columns) - 1
    if n_topics < 1:
        raise ValueError(
            "df must contain an influencer column followed by at least one topic column"
        )
    column_names = ['influencer'] + [f'topic_{i}' for i in range(n_topics)]

    # create BigQuery table if it doesn't already exist
    try:
        client.get_table(table_id)  # Make an API request.
        print("Table {} already exists.".format(table_id))
    except NotFound:
        schema = [bigquery.SchemaField("influencer", "STRING", mode="REQUIRED")]
        schema += [
            bigquery.SchemaField(name, "FLOAT64", mode="REQUIRED")
            for name in column_names[1:]
        ]
        client.create_table(bigquery.Table(table_id, schema=schema))  # Make an API request.
        # Only report success once the table really exists.
        print(f"Table {table_id} created.")

    # Rename on a copy so the caller's DataFrame keeps its own column labels.
    upload = df.copy()
    upload.columns = column_names

    # to_gbq expects "dataset.table" plus a separate project id.
    project_id, destination_table = table_id.split('.', 1)
    upload.to_gbq(destination_table, project_id=project_id, if_exists=if_exists,
                  chunksize=10000, progress_bar=True, credentials=credentials)
    
    
# Upload the per-influencer topic shares to the default BigQuery results table.
# Rows are appended (`if_exists='append'`), so re-running this cell adds them again.
save_results(res_infl)

Table datascience-abovezero.ml_sandbox.chegg_influencers_comments_results already exists.


  pandas_gbq.to_gbq(
100%|██████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 5809.29it/s]
