In [1]:
Check description to view diagrams

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.datasets.samples_generator import make_blobs
from sklearn.cluster import KMeans
import sqlite3
import plotly as py
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot,plot
# Enable plotly's offline notebook rendering so the iplot(...) calls in the
# cells below can display figures inline.
# NOTE(review): connected=True presumably loads plotly.js from a CDN rather
# than embedding it in the notebook -- confirm against the plotly docs.
init_notebook_mode(connected=True)

In [2]:
# Open the SQLite database and read the complete `user` table into `df`,
# the DataFrame every later cell works from.
con = sqlite3.connect('Clubhouse_Dataset_v1.db')
df = pd.read_sql_query('SELECT * FROM user', con)

# Quick data-quality report: missing values per column, then column dtypes.
null_counts_per_column = df.isnull().sum()
print('Finding null values\n', null_counts_per_column)

column_dtypes = df.dtypes
print('\n\nInfo\n', column_dtypes)

Finding null values
 user_id                        0
name                           0
photo_url                  63878
username                       0
twitter                        0
instagram                      0
num_followers                  0
num_following                  0
time_created                   0
invited_by_user_profile        0
dtype: int64


Info
 user_id                     int64
name                       object
photo_url                  object
username                   object
twitter                    object
instagram                  object
num_followers               int64
num_following               int64
time_created               object
invited_by_user_profile    object
dtype: object


In [3]:
# finding new users created

# Reduce each sign-up timestamp to its calendar date so that users can be
# grouped per day. `df` is updated in place, as before; re-running the cell
# is safe because to_datetime also accepts the already-converted dates.
df['time_created'] = pd.to_datetime(df['time_created']).dt.date

# One row per day with the number of accounts created on that day.
# The count column keeps the historical name `user_id` so that any code
# reading `date_df['user_id']` keeps working; it holds a COUNT, not an id.
# groupby().size() replaces the former "collect ids into a list, then
# overwrite each cell with len(list)" loop, which relied on chained
# assignment (`date_df['user_id'].iloc[i] = ...`) and therefore raised
# SettingWithCopyWarning / silently did nothing under copy-on-write.
date_df = df.groupby('time_created').size().reset_index(name='user_id')

trace1 = go.Scatter(
    x=date_df['time_created'],
    y=date_df['user_id'],
    mode="lines",
    name="new users",  # was the copy-pasted label "citations"
    marker=dict(color='blue'),
)

data = [trace1]
# The x axis carries the dates and the y axis the daily count; the original
# layout put the count label on the x axis and left the y axis unlabeled.
layout = dict(
    title='New Users Created / Day',
    xaxis=dict(title='Date', ticklen=5, zeroline=False),
    yaxis=dict(title='Number of New Users', ticklen=5, zeroline=False),
)
fig = dict(data=data, layout=layout)
iplot(fig)

In [4]:
# top users who invited other users

TOP_INVITERS = 10  # how many inviters to show in the chart

# Number of invited users per inviter id, largest first, limited to the top N.
# head() simply returns fewer rows when there are fewer than N inviters, so
# nothing below assumes exactly ten rows (the original `[''] * 10` did).
invited_df = (
    df['invited_by_user_profile']
    .value_counts()
    .head(TOP_INVITERS)
    .rename_axis('user_id')
    .reset_index(name='no_of_users_invited')
)

# Resolve inviter ids to display names.
# `invited_by_user_profile` is an object column while `df['user_id']` is
# int64, so the former `df['user_id'] == <inviter id>` comparison never
# matched (it is what produced the "elementwise comparison failed" warning)
# and every bar ended up labeled with the raw id. Coerce the ids to numbers
# first and look them up through a user_id -> name Series instead.
inviter_ids = pd.to_numeric(invited_df['user_id'], errors='coerce')
# drop_duplicates keeps the first row per id, matching the old `values[0]`.
name_by_user_id = df.drop_duplicates('user_id').set_index('user_id')['name']
resolved_names = inviter_ids.map(name_by_user_id)

# Fallback labels for ids that cannot be resolved: the empty id means the
# user was not invited by anyone; any other unknown id is shown as-is.
fallback_labels = invited_df['user_id'].where(invited_df['user_id'] != '', 'No Inviter')
invited_df['name'] = resolved_names.fillna(fallback_labels)

trace1 = go.Bar(
    x=invited_df['name'],
    y=invited_df['no_of_users_invited'],
    name="users invited",  # was the copy-pasted label "citations"
    marker=dict(color='orange', line=dict(color='black')),
)
data = [trace1]
layout = dict(
    title='Top 10 Users Who Invited Others',
    xaxis=dict(title='Username', ticklen=5, zeroline=False),
    yaxis=dict(title='Number of Users Invited', ticklen=5, zeroline=False),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)


elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



In [5]:
followers_following_df = df[['num_followers','num_following']]

# dividing users by number of followers they have; >1mil, 1mil-100K, 100K-10K, 10K-5K


def _followers_cluster(lower, upper=None):
    """Return the users whose follower count lies in (lower, upper].

    Parameters
    ----------
    lower : int
        Exclusive lower bound on `num_followers`.
    upper : int or None
        Inclusive upper bound on `num_followers`; None means unbounded.

    Returns
    -------
    pandas.DataFrame
        Columns `index` (the row label of the user in `df`), `num_followers`,
        `num_following` and `name`, sorted by `num_followers` descending and
        re-indexed 0..n-1.
    """
    follower_counts = followers_following_df['num_followers']
    in_range = follower_counts > lower
    if upper is not None:
        # Upper bound is inclusive so that users sitting exactly on a
        # boundary (e.g. 100000 followers) land in a cluster; the original
        # strict `<` on both sides dropped them from every cluster.
        in_range = in_range & (follower_counts <= upper)
    cluster = (
        followers_following_df.loc[in_range]
        .sort_values(['num_followers'], ascending=False)
        # Label-aligned lookup of the user's name (the original used .iloc
        # with index labels, which is only correct for a default RangeIndex).
        .assign(name=df['name'])
    )
    # Move the original df row labels into an `index` column.
    return cluster.reset_index(level=0)


g1m_followers = _followers_cluster(1000000)
b1m_followers = _followers_cluster(100000, 1000000)
less_lm_followers = _followers_cluster(10000, 100000)
less_l0k_followers = _followers_cluster(5000, 10000)

# (cluster frame, legend label, marker colour) for each scatter trace.
cluster_styles = [
    (g1m_followers, "> 1Mil Followers", 'rgba(162, 135, 250, 0.8)'),
    (b1m_followers, "1Mil - 100K Followers", 'rgba(255, 128, 2, 0.8)'),
    (less_lm_followers, "100K - 10K Followers", 'rgba(80, 28, 246, 0.8)'),
    (less_l0k_followers, "10K - 5K Followers", 'rgba(121, 249, 18, 0.8)'),
]

# Each trace takes its hover text from its OWN cluster; the original fourth
# trace showed the names of the 100K - 10K cluster by mistake.
data = [
    go.Scatter(
        x=cluster['num_following'],
        y=cluster['num_followers'],
        mode="markers",
        name=legend_label,
        marker=dict(color=marker_color),
        text=cluster['name'],
    )
    for cluster, legend_label, marker_color in cluster_styles
]
# Keep the individual trace names available, as the original cell defined them.
trace1, trace2, trace3, trace4 = data

layout = dict(title = 'Users Seperated to Four clusters based on Number of Followers',
              xaxis= dict(title= 'Number Following',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'Number of Followers',ticklen= 5,zeroline= False)
             )
fig = dict(data = data, layout = layout)
iplot(fig)

In [187]:
# Checking Twitter accounts

# Values that mean "no Twitter handle on the profile".
# NOTE(review): the column has no real nulls, so missing handles must be
# encoded as a placeholder string; '', 'null' and 'none' are assumed here --
# confirm against the dataset.
missing_twitter_values = {'', 'null', 'none'}

twitter_labels = ['Have Twitter Accounts', 'No Twitter Accounts']

twitter_percentages = []
for cluster in (g1m_followers, b1m_followers, less_lm_followers):
    # The cluster frames were re-indexed to 0..n-1; the row labels of their
    # users in `df` live in the `index` column. The original code used
    # `cluster.index` and therefore always inspected the first n rows of
    # `df` instead of the users that belong to the cluster.
    handles = df.loc[cluster['index'], 'twitter']
    has_account = handles.notna() & ~(
        handles.astype(str).str.strip().str.lower().isin(missing_twitter_values)
    )
    # Share of users with a handle, counted directly. The former
    # `nunique() - 1` shortcut was off by one whenever every user in the
    # cluster had a handle and miscounted repeated handles.
    # An empty cluster yields NaN (nothing to plot) instead of dividing by zero.
    twitter_percentages.append(has_account.mean() * 100)

(percent_class_1_with_twitter_accounts,
 percent_class_2_with_twitter_accounts,
 percent_class_3_with_twitter_accounts) = twitter_percentages

# Two-row frames (have / do not have an account) feeding the pie charts.
class_1_df, class_2_df, class_3_df = [
    pd.DataFrame({'Account': twitter_labels, 'Count': [percent, 100 - percent]})
    for percent in twitter_percentages
]

fig = make_subplots(rows=2, cols=2,specs=[[{"type": "pie"},{"type": "pie"}],[{"type": "pie"},{"type": "pie"}]])
# (frame, trace name, subplot row, subplot column)
pie_slots = [
    (class_1_df, "Class-1", 1, 1),
    (class_2_df, "Class-2", 1, 2),
    (class_3_df, "Class-3", 2, 1),
]
for class_df, trace_name, row, col in pie_slots:
    fig.add_trace(go.Pie(labels=class_df.Account, values=class_df.Count, name=trace_name),
                  row, col)

# Render the pies as donuts.
fig.update_traces(hole=.4)

# The annotations act as captions placed underneath each pie.
fig.update_layout(
    title_text="Twitter Account Percentage Based on Clusters",
    annotations=[dict(text='> 1 Mil', x=0.19, y=0.5, font_size=10, showarrow=False),
                 dict(text='1 Mil - 100K', x=0.83, y=0.5, font_size=10, showarrow=False),
                dict(text='100K - 10K', x=0.19, y=-0.13, font_size=10, showarrow=False)])
fig.show()

In [186]:
# Checking Instagram accounts

# Values that mean "no Instagram handle on the profile".
# NOTE(review): the column has no real nulls, so missing handles must be
# encoded as a placeholder string; '', 'null' and 'none' are assumed here --
# confirm against the dataset.
missing_instagram_values = {'', 'null', 'none'}

instagram_labels = ['Have Instagram Accounts', 'No Instagram Accounts']

instagram_percentages = []
for cluster in (g1m_followers, b1m_followers, less_lm_followers):
    # The cluster frames were re-indexed to 0..n-1; the row labels of their
    # users in `df` live in the `index` column. The original code used
    # `cluster.index` and therefore always inspected the first n rows of
    # `df` instead of the users that belong to the cluster.
    handles = df.loc[cluster['index'], 'instagram']
    has_account = handles.notna() & ~(
        handles.astype(str).str.strip().str.lower().isin(missing_instagram_values)
    )
    # Share of users with a handle, counted directly. The former
    # `nunique() - 1` shortcut was off by one whenever every user in the
    # cluster had a handle and miscounted repeated handles.
    # An empty cluster yields NaN (nothing to plot) instead of dividing by zero.
    instagram_percentages.append(has_account.mean() * 100)

(percent_class_1_with_instagram_accounts,
 percent_class_2_with_instagram_accounts,
 percent_class_3_with_instagram_accounts) = instagram_percentages

# Two-row frames (have / do not have an account) feeding the pie charts.
class_1_df, class_2_df, class_3_df = [
    pd.DataFrame({'Account': instagram_labels, 'Count': [percent, 100 - percent]})
    for percent in instagram_percentages
]

fig = make_subplots(rows=2, cols=2,specs=[[{"type": "pie"},{"type": "pie"}],[{"type": "pie"},{"type": "pie"}]])
# (frame, trace name, subplot row, subplot column)
pie_slots = [
    (class_1_df, "Class-1", 1, 1),
    (class_2_df, "Class-2", 1, 2),
    (class_3_df, "Class-3", 2, 1),
]
for class_df, trace_name, row, col in pie_slots:
    fig.add_trace(go.Pie(labels=class_df.Account, values=class_df.Count, name=trace_name),
                  row, col)

# Render the pies as donuts.
fig.update_traces(hole=.4)

# The annotations act as captions placed underneath each pie.
fig.update_layout(
    title_text="Instagram Account Percentage Based on Clusters",
    annotations=[dict(text='> 1 Mil', x=0.19, y=0.5, font_size=10, showarrow=False),
                 dict(text='1 Mil - 100K', x=0.83, y=0.5, font_size=10, showarrow=False),
                dict(text='100K - 10K', x=0.19, y=-0.13, font_size=10, showarrow=False)])
fig.show()
