# INSY 670 Group Project
Alice Liu, Diwei Zhu, Yingxin Jiang, Kexin Wang, Yichen Wang

# Part 1

## 1.  Predictive Model

### 1.1 Environment Setup

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler,Normalizer
!pip install decorator==5.0.9 

Collecting decorator==5.0.9
  Downloading decorator-5.0.9-py3-none-any.whl (8.9 kB)
Installing collected packages: decorator
  Attempting uninstall: decorator
    Found existing installation: decorator 5.1.0
    Uninstalling decorator-5.1.0:
      Successfully uninstalled decorator-5.1.0
Successfully installed decorator-5.0.9


### 1.2 Data Pre-processing

In [2]:
# load dataset via pandas
train = pd.read_csv('train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5500 entries, 0 to 5499
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Choice               5500 non-null   int64  
 1   A_follower_count     5500 non-null   int64  
 2   A_following_count    5500 non-null   int64  
 3   A_listed_count       5500 non-null   int64  
 4   A_mentions_received  5500 non-null   float64
 5   A_retweets_received  5500 non-null   float64
 6   A_mentions_sent      5500 non-null   float64
 7   A_retweets_sent      5500 non-null   float64
 8   A_posts              5500 non-null   float64
 9   A_network_feature_1  5500 non-null   int64  
 10  A_network_feature_2  5500 non-null   float64
 11  A_network_feature_3  5500 non-null   float64
 12  B_follower_count     5500 non-null   int64  
 13  B_following_count    5500 non-null   int64  
 14  B_listed_count       5500 non-null   int64  
 15  B_mentions_received  5500 non-null   f

In [3]:
# initiate dataframe containing columns of difference between A and B features (A-B)
df = pd.DataFrame(columns = ["A_B_follower_count","A_B_following_count","A_B_listed_count","A_B_mentions_received",
                "A_B_retweets_received","A_B_mentions_sent","A_B_retweets_sent","A_B_posts","A_B_network_feature_1",
                "A_B_network_feature_2","A_B_network_feature_3"])

In [4]:
# create dataframe for difference between A and B features
for i in range(11):
    series = train.iloc[:,i+1]-train.iloc[:,i+1+11]
    df.iloc[:,i] = series
df['Choice'] = train['Choice']
df

Unnamed: 0,A_B_follower_count,A_B_following_count,A_B_listed_count,A_B_mentions_received,A_B_retweets_received,A_B_mentions_sent,A_B_retweets_sent,A_B_posts,A_B_network_feature_1,A_B_network_feature_2,A_B_network_feature_3,Choice
0,-34235,-29506,-1686,-14.846518,-3.883525,-8.103828,-0.231920,-6.626665,-64,90.969697,9438.106061,0
1,-17671,331,-1382,49.961485,16.854685,2.481652,0.546816,4.106299,206,-113.587704,-1601.149290,0
2,3688,733,-105,24.768949,9.201969,4.758317,0.490702,2.986516,92,58.594502,5722.563574,0
3,-19542,-17630,-276,-565.184032,-390.016375,-26.220532,-7.067053,-29.271279,-1756,-21.469296,-1299.678967,0
4,38035,-849,2460,127.252413,33.417223,21.117111,2.213765,19.298035,466,78.904293,840.220036,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5495,-762629,-1185,-13787,-5351.839938,-4730.783795,54.126383,3.006633,86.017644,-20651,28.807054,2171.964468,0
5496,-682,-236,-3,1.139100,-0.205570,0.000000,0.000000,-1.018804,4,47.600000,-3554.800000,1
5497,7831,-1038,566,34.391680,18.740284,0.181576,-0.777790,1.940731,117,103.995098,1292.989740,0
5498,-57424,-37392,-21681,455.382230,415.318328,-1.388242,-1.244570,3.138261,1907,-230.511754,-6661.772353,0


### 1.3 Features Selection

In [5]:
# set predicting variables and outcomes
### 应该是 X = df.iloc[:,:11]
#X = df.iloc[:,:10]
X = df.iloc[:,:11]
y = df['Choice']

# Standardization
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [6]:
# Random Forest for features selection
randomforest = RandomForestClassifier(random_state=42)
model = randomforest.fit(X_std,y)
importance = pd.DataFrame(list(zip(X.columns,model.feature_importances_)),columns=
             ['predictor','feature importance'])
print(importance.sort_values(by=['feature importance'],ascending = False).head(5))

               predictor  feature importance
2       A_B_listed_count            0.182173
8  A_B_network_feature_1            0.123855
3  A_B_mentions_received            0.114590
0     A_B_follower_count            0.111455
4  A_B_retweets_received            0.087582


In [7]:
# Based on features selection result, keep the columns we need
df2 = df[['A_B_listed_count','A_B_network_feature_1','A_B_mentions_received','A_B_follower_count','A_B_retweets_received']]

### 1.4 Binary Classification

#### 1.4.1 Train Test Split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_std,y,test_size=0.33,random_state=5)

#### 1.4.2 Logistic Regression

In [9]:
# Run the Model
lr = LogisticRegression()
model_lr = lr.fit(X_std,y)

# Calculate the accuracy score
y_test_pred = model_lr.predict(X_test)
print('Logistic Regression Accuracy Score:',accuracy_score(y_test,y_test_pred))

# Confusion Matrix
print('Logistic Regression Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test,y_test_pred,labels=[0,1]),index=['true:0','true:1'],columns=['pred:0','pred:1']))

Logistic Regression Accuracy Score: 0.7366391184573002
Logistic Regression Confusion Matrix:
        pred:0  pred:1
true:0     629     257
true:1     221     708


#### 1.4.3 Random Forest

In [10]:
# Run the Model
randomforest = RandomForestClassifier(random_state=42)
model_rf = randomforest.fit(X_train,y_train)

# Calculate the accuracy score
y_test_pred = model_rf.predict(X_test)
print('Random Forest Accuracy Score:',accuracy_score(y_test,y_test_pred))

# Confusion Matrix
print('Random Forest Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test,y_test_pred,labels=[0,1]),index=['true:0','true:1'],columns=['pred:0','pred:1']))

Random Forest Accuracy Score: 0.7619834710743801
Random Forest Confusion Matrix:
        pred:0  pred:1
true:0     675     211
true:1     221     708


#### 1.4.4 Gradient Boosting

In [11]:
# Run the Model
gbt = GradientBoostingClassifier(random_state=42)
model_gbt = gbt.fit(X_train,y_train)

# Calculate the accuracy score
y_test_pred = model_gbt.predict(X_test)
print('Gradient Boosting Accuracy Score:',accuracy_score(y_test,y_test_pred))

# Confusion Matrix
print('Gradient Boosting Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test,y_test_pred,labels=[0,1]),index=['true:0','true:1'],columns=['pred:0','pred:1']))

Gradient Boosting Accuracy Score: 0.7730027548209366
Gradient Boosting Confusion Matrix:
        pred:0  pred:1
true:0     672     214
true:1     198     731


According to accuracy score, we see that gradient boosting has the best performance.

### 1.5 Results Analysis

**1. Which factors are best predictors of influence?**<br>
According to the result of Random Forest's feature importance, we selected the top 5 factors: difference between A and B in 'A_B_listed_count','A_B_network_feature_1','A_B_mentions_received','A_B_follower_count', and 'A_B_retweets_received'.<br><br>

**2. Are there any surprises here?** <br>
We are not surprised by the result, because the more influential a person is on social media, the more interactive they become (mentions_received, retweets_received), the more popularity they gain (listed_count, follower_count).<br><br>

**3. How can a business use your model/results?** <br>
With our model, companies are able to find the most interactive and popular influencers among all users in Twitter to promote their products.

## 2. Financial value calculation

Influencers tweet twice: <br>
profit margin: PM_tweet twice = -\\$10 + \\$10\*0.03%\*followers <br>

Influencers tweet once: <br>
profit margin: PM_tweet once = -\\$5 + \\$10\*0.02%\*followers <br>

Normal users tweet once: <br> 
profit margin: PM_non influencer = -\\$5

### 2.1 expected net profit without model

Make every user tweet once

In [12]:
# mean values of followers
A_follower_avg = train["A_follower_count"].mean()
B_follower_avg = train["B_follower_count"].mean()

In [13]:
# expected profit calculation
cost1 = 8*(-5)
income_influencer1 = 2*A_follower_avg*0.0002*10 + 2*B_follower_avg*0.0002*10  # John, Sue, Sandy & Moe
income_non_influencer1 = 0  # Ted, Ron, Fred & Alex

expected_net_profit_without_model = cost1 + income_influencer1 + income_non_influencer1 

print("net profit without the model: $"+ str(expected_net_profit_without_model))

net profit without the model: $5301.483243636363


### 2.2 expected net profit with the perfect model

Precisely identify all influencers

In [14]:
cost2 = 4*(-10)
income_influencer2 = 2*A_follower_avg*0.0003*10 + 2*B_follower_avg*0.0003*10  # John, Sue, Sandy & Moe
income_non_influencer2 = 0  # Ted, Ron, Fred & Alex

expected_net_profit_with_perfect_model = cost2 + income_influencer2 + income_non_influencer2 

print("net profit with the perfect model: $"+ str(expected_net_profit_with_perfect_model))

net profit with the perfect model: $7972.224865454546


### 2.3 expected net profit with our model (accuracy = 0.7769)

~77% of the influencers would be successfully identified by our model.

In [15]:
accuracy = accuracy_score(y_test,y_test_pred)

In [16]:
accuracy

0.7730027548209366

In [17]:
cost3 = 4*(-10)*accuracy
income_influencer3 = 2*A_follower_avg*0.0003*10*accuracy + 2*B_follower_avg*0.0003*10*accuracy
income_non_influencer3 = 0

expected_net_profit_with_our_model = cost3 + income_influencer3 + income_non_influencer3 

print("net profit with our model: $"+ str(expected_net_profit_with_our_model))

net profit with our model: $6162.551783048335


### 2.4 Financial value results

In [18]:
print("boost in expected net profit from using our model = $"+ str(expected_net_profit_with_our_model - expected_net_profit_without_model))
print("boost in expected net profit from using perfect model = $"+ str(expected_net_profit_with_perfect_model - expected_net_profit_without_model))

boost in expected net profit from using our model = $861.0685394119719
boost in expected net profit from using perfect model = $2670.7416218181825


# Part 2

## 1. Environment Setup

In [19]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import re

## 2. Twitter Data Scraping
This section will take a long time to run.

In [20]:
key_word = "Zelda"  # Declare the key word used to search tweets
user_name = ""   # Declare a user name used to search tweets
from_date = "2020-01-01" # Declare a start date
end_date = '2021-03-19'  # Declare a end date
count = 5000             # The maximum number of tweets
tweets_list_keyword = [] # A list used to store the returned results for keyword search
tweets_list_user = []    # A list used to store the retuned results for user search

In [21]:
command_keyword = key_word+' since:'+from_date+' until:'+end_date # Define a string command for Scraper Api

print("Scraping data for keyword:", key_word)

for i,tweet in enumerate(sntwitter.TwitterSearchScraper(command_keyword).get_items()):
    tweets_list_keyword.append([tweet.user.username, tweet.mentionedUsers, tweet.retweetedTweet]) # Append returned results to list
    if i>count:
        break;
        
# Create a dataframe from the tweets list above 
tweets_df_keyword = pd.DataFrame(tweets_list_keyword, columns=['username','mentionedusers', 'retweet'])
tweets_df_keyword.to_csv("tweets_keywords.csv",index=False) # Export to a csv file
print("Scraped data have been exported to the csv file")

Scraping data for keyword: Zelda
Scraped data have been exported to the csv file


## 3. Data  Preprocessing

In [22]:
# load the tweets data we scrape
tweets = pd.read_csv('tweets_keywords.csv')

# select columns we need to work with
tweets = tweets[['username','mentionedusers']]

# drop null values, aka tweets with no interactions between users
#tweets = tweets.dropna()

# reset index from 0
tweets = tweets.reset_index()

# drop the index column
df = tweets[['username','mentionedusers']]

In [23]:
df

Unnamed: 0,username,mentionedusers
0,RMDCade,
1,RMDCade,
2,Buster5ive,"[User(username='OnThisDayGaming', id=295461858..."
3,Skullivan_Bones,
4,KdogGaming31,
...,...,...
4997,drecksuser,"[User(username='Zelda_king13', id=952632876817..."
4998,Scifi_SpaceTech,
4999,JesseNiven,
5000,NightLeCapybara,"[User(username='Ulysse_Dupau', id=4824602529, ..."


In [24]:
import numpy as np
for i in range(len(df["username"])):
    string = "username="
    df["mentionedusers"]=df["mentionedusers"].replace(np.nan,string)

temp=[]
for i in range(len(df["username"])):
    temp_set = []
    temp_set.append(df["username"][i])
    temp_set.append(df["mentionedusers"][i])
    temp.append(temp_set)

for i in range(len(temp)):
    if temp[i][1]=="username=":
        temp_string ="username='"+temp[i][0]+"',"
        temp[i][1] = temp_string

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["mentionedusers"]=df["mentionedusers"].replace(np.nan,string)


In [25]:
tweets = pd.DataFrame(temp,columns=["username","mentionedusers"])
tweets

Unnamed: 0,username,mentionedusers
0,RMDCade,"username='RMDCade',"
1,RMDCade,"username='RMDCade',"
2,Buster5ive,"[User(username='OnThisDayGaming', id=295461858..."
3,Skullivan_Bones,"username='Skullivan_Bones',"
4,KdogGaming31,"username='KdogGaming31',"
...,...,...
4997,drecksuser,"[User(username='Zelda_king13', id=952632876817..."
4998,Scifi_SpaceTech,"username='Scifi_SpaceTech',"
4999,JesseNiven,"username='JesseNiven',"
5000,NightLeCapybara,"[User(username='Ulysse_Dupau', id=4824602529, ..."


In [26]:
# calculate the number of mentioned users in a tweet
count = []
for i in range(tweets.shape[0]):
    value = tweets['mentionedusers'][i].count('username=')
    count.append(value)
tweets['#mentionedusers']=pd.Series(count)

In [27]:
tweets

Unnamed: 0,username,mentionedusers,#mentionedusers
0,RMDCade,"username='RMDCade',",1
1,RMDCade,"username='RMDCade',",1
2,Buster5ive,"[User(username='OnThisDayGaming', id=295461858...",1
3,Skullivan_Bones,"username='Skullivan_Bones',",1
4,KdogGaming31,"username='KdogGaming31',",1
...,...,...,...
4997,drecksuser,"[User(username='Zelda_king13', id=952632876817...",2
4998,Scifi_SpaceTech,"username='Scifi_SpaceTech',",1
4999,JesseNiven,"username='JesseNiven',",1
5000,NightLeCapybara,"[User(username='Ulysse_Dupau', id=4824602529, ...",2


In [28]:
# take a look at how many users are mentioned
tweets['#mentionedusers'].value_counts()

1     4378
2      310
3       75
15      58
16      54
4       26
7       20
11      17
5       16
8       13
6        9
10       8
9        5
17       3
12       3
25       2
48       2
19       1
14       1
13       1
Name: #mentionedusers, dtype: int64

In [29]:
# cleaning mentionedusers column
for i in range(tweets.shape[0]):
    string = tweets['mentionedusers'][i]
    users = re.findall(r'username=\S+', string)
    tweets['mentionedusers'][i]=users

# extract usernames
for i in range(tweets.shape[0]):
    for j in range(len(tweets['mentionedusers'][i])):
        newstring = tweets['mentionedusers'][i][j]
        newstring = newstring[10:]
        newstring = newstring[:-2]
        tweets['mentionedusers'][i][j] = newstring

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['mentionedusers'][i]=users


## 4. Interaction records

In [30]:
# initiate two empty list for appending information
user_mentioned = []
user_posting_tweet = []

In [31]:
# append interactions between users to list
for i in range(tweets.shape[0]):
    for j in range(len(tweets['mentionedusers'][i])):
        user_mentioned.append(tweets['mentionedusers'][i][j])
        user_posting_tweet.append(tweets['username'][i])

In [32]:
# create a dataframe for interactions
interactions = pd.DataFrame()
interactions['user_posting_tweet'] = user_posting_tweet
interactions['user_mentioned'] = user_mentioned

In [33]:
# define interaction type
Type = []
for i in range(len(user_posting_tweet)):
    if user_posting_tweet[i]==user_mentioned[i]:
        Type.append("Tweet")
    else:
        Type.append("Non-tweet")
interactions["Type"]=Type

In [34]:
# export to csv
interactions.to_csv("users_interactions.csv",index=False) # Export to a csv file
interactions

Unnamed: 0,user_posting_tweet,user_mentioned,Type
0,RMDCade,RMDCade,Tweet
1,RMDCade,RMDCade,Tweet
2,Buster5ive,OnThisDayGaming,Non-tweet
3,Skullivan_Bones,Skullivan_Bones,Tweet
4,KdogGaming31,KdogGaming31,Tweet
...,...,...,...
8025,Scifi_SpaceTech,Scifi_SpaceTech,Tweet
8026,JesseNiven,JesseNiven,Tweet
8027,NightLeCapybara,Ulysse_Dupau,Non-tweet
8028,NightLeCapybara,obitrie_,Non-tweet


## 5. Mentioned Users Counts

In [35]:
tweets

Unnamed: 0,username,mentionedusers,#mentionedusers
0,RMDCade,[RMDCade],1
1,RMDCade,[RMDCade],1
2,Buster5ive,[OnThisDayGaming],1
3,Skullivan_Bones,[Skullivan_Bones],1
4,KdogGaming31,[KdogGaming31],1
...,...,...,...
4997,drecksuser,"[Zelda_king13, YungGlurak]",2
4998,Scifi_SpaceTech,[Scifi_SpaceTech],1
4999,JesseNiven,[JesseNiven],1
5000,NightLeCapybara,"[Ulysse_Dupau, obitrie_]",2


In [36]:
# count how many times a user is mentioned by others
most = []
for i in range(tweets.shape[0]):
    for j in range(len(tweets['mentionedusers'][i])):
        user = tweets['mentionedusers'][i][j]
        most.append(user)

In [37]:
# see the ranking
most = pd.Series(most)
value_counts = most.value_counts()

# converting to df and assigning new names to the columns
df_value_counts = pd.DataFrame(value_counts)
df_value_counts = df_value_counts.reset_index()
df_value_counts.columns = ['username', 'mentioned_counts'] # change column names
df_value_counts.to_csv("mentioned_users_counts.csv",index=False) # Export to a csv file
df_value_counts

Unnamed: 0,username,mentioned_counts
0,Zeldathons,262
1,Zelda_king13,115
2,sithlord_zelda,87
3,Bonu19RC,87
4,DonnaVita4,87
...,...,...
4120,AlterPichu,1
4121,oopimp420,1
4122,GlaxyDDR,1
4123,ANeckie,1


## 6. degree, betweeness, closeness

In [38]:
# read data
data = pd.read_csv("users_interactions.csv")

In [39]:
# import library for network calculation
import networkx as nx
G = nx.DiGraph()

In [40]:
data

Unnamed: 0,user_posting_tweet,user_mentioned,Type
0,RMDCade,RMDCade,Tweet
1,RMDCade,RMDCade,Tweet
2,Buster5ive,OnThisDayGaming,Non-tweet
3,Skullivan_Bones,Skullivan_Bones,Tweet
4,KdogGaming31,KdogGaming31,Tweet
...,...,...,...
8025,Scifi_SpaceTech,Scifi_SpaceTech,Tweet
8026,JesseNiven,JesseNiven,Tweet
8027,NightLeCapybara,Ulysse_Dupau,Non-tweet
8028,NightLeCapybara,obitrie_,Non-tweet


In [41]:
# create pairs for interactions between users
lst=list()
for (a,b) in zip(data["user_posting_tweet"], data["user_mentioned"]):
    lst.append((a,b))
G.add_edges_from(lst)

In [42]:
lst

[('RMDCade', 'RMDCade'),
 ('RMDCade', 'RMDCade'),
 ('Buster5ive', 'OnThisDayGaming'),
 ('Skullivan_Bones', 'Skullivan_Bones'),
 ('KdogGaming31', 'KdogGaming31'),
 ('WizCoPow', 'WizCoPow'),
 ('ArcturusChusky', 'ArcturusChusky'),
 ('Doge_This101', 'Doge_This101'),
 ('MrCZAR', 'Keegmania'),
 ('boonbap', 'boonbap'),
 ('ExquisiteRed', 'ExquisiteRed'),
 ('Sarse1994', 'Tearastar'),
 ('bluepikachu387', 'bluepikachu387'),
 ('Bearman851', 'Zeldathons'),
 ('aIbedokisser', 'aIbedokisser'),
 ('CammyWammy0507', 'CammyWammy0507'),
 ('PEACHVALENTlNE', 'PEACHVALENTlNE'),
 ('WizardEmu', 'WizardEmu'),
 ('dorepo_toledo', 'dorepo_toledo'),
 ('BZRich64', 'jupy314'),
 ('MarcheX800', 'MarcheX800'),
 ('zeldarp_ebooks', 'zeldarp_ebooks'),
 ('galact0sa', 'SbeveTM'),
 ('marmastry', 'marmastry'),
 ('GradyTTV', 'GradyTTV'),
 ('tartan_bunny', 'probablePenguin'),
 ('clearofthedoors', 'clearofthedoors'),
 ('jaymisaeki', 'Zeldathons'),
 ('DanielJFerguson', 'DanielJFerguson'),
 ('timeforwizards', 'timeforwizards'),
 ('R

In [43]:
# closeness centrality calculation
closeness_centrality=pd.DataFrame.from_dict(nx.closeness_centrality(G), orient='index').reset_index()
closeness_centrality = closeness_centrality.rename(columns={0: 'closeness'})
closeness_centrality

Unnamed: 0,index,closeness
0,RMDCade,0.000000
1,Buster5ive,0.000000
2,OnThisDayGaming,0.000538
3,Skullivan_Bones,0.000000
4,KdogGaming31,0.000000
...,...,...
5576,Scifi_SpaceTech,0.000000
5577,JesseNiven,0.000000
5578,NightLeCapybara,0.000000
5579,Ulysse_Dupau,0.000179


In [44]:
# degree centrality calculation
degree_centrality=pd.DataFrame.from_dict(nx.degree_centrality(G), orient='index').reset_index()
degree_centrality = degree_centrality.rename(columns={0: 'degree'})
degree_centrality

Unnamed: 0,index,degree
0,RMDCade,0.000358
1,Buster5ive,0.000538
2,OnThisDayGaming,0.000896
3,Skullivan_Bones,0.000358
4,KdogGaming31,0.000358
...,...,...
5576,Scifi_SpaceTech,0.000358
5577,JesseNiven,0.000358
5578,NightLeCapybara,0.000358
5579,Ulysse_Dupau,0.000179


In [45]:
# betweeness centrality calculation
betweenness_centrality=pd.DataFrame.from_dict(nx.betweenness_centrality(G), orient='index').reset_index()
betweenness_centrality = betweenness_centrality.rename(columns={0: 'betweenness'})
betweenness_centrality

Unnamed: 0,index,betweenness
0,RMDCade,0.0
1,Buster5ive,0.0
2,OnThisDayGaming,0.0
3,Skullivan_Bones,0.0
4,KdogGaming31,0.0
...,...,...
5576,Scifi_SpaceTech,0.0
5577,JesseNiven,0.0
5578,NightLeCapybara,0.0
5579,Ulysse_Dupau,0.0


In [46]:
# drop betweeness 0 values
betweenness_centrality[betweenness_centrality['betweenness']!=0]

Unnamed: 0,index,betweenness
80,ChaiKovsky,1.124288e-07
81,rbudd913,3.372863e-07
211,spiegelbro,4.031375e-06
212,JustVNTY,1.349145e-06
214,Mynex_,8.030627e-08
217,Mupf05YT,4.545335e-06
222,YungKermit1,8.030627e-08
224,drecksuser,3.854701e-07
254,ZeroKJD,2.569801e-07
299,TrinityBruns,2.569801e-07


## 7. Top 100

Predictor  feature importance (weight)<br>
 network feature 1 = degree centrality<br><br>
       A_B_listed_count            0.182173<br>
  A_B_network_feature_1            0.123855<br> 
  A_B_mentions_received            0.114590<br>
    A_B_follower_count            0.111455<br>
 A_B_retweets_received            0.087582<br>


### User info scraping
We comment this part out because it takes a long time to scrape data from Twitter. We also need to deal with authorization problems. The scraped data will be directly read from a csv we provided. 

In [47]:
# # get user information for the predictive model from all users appreared in our 5000 scraped tweets
#Score = w1 × retweets + w2 × listed_count + w3 × #followers + w4 × network_feature_1, where w1+w2+w3+ w4 = 1.
# !pip install tweepy

In [48]:
import pandas as pd
df_value_counts = pd.read_csv("mentioned_users_counts.csv")
df_value_counts

Unnamed: 0,username,mentioned_counts
0,Zeldathons,262
1,Zelda_king13,115
2,sithlord_zelda,87
3,Bonu19RC,87
4,DonnaVita4,87
...,...,...
4120,AlterPichu,1
4121,oopimp420,1
4122,GlaxyDDR,1
4123,ANeckie,1


In [49]:
# import the module
# import tweepy
  
# assign the values accordingly

# consumer_key = "ivL04VmAWKxWjeIM6TaeW7qus"
# consumer_secret = "wjQDHANXQOjljEthdFLHypqZMxJRRpTgVAayJFzegAPDEaCycD"
# access_token = "1490765033697165315-ycria79DhrhyfPXbx8ttWoOSsACdvT"
# access_token_secret = "mF9ltmDDQtTo8vT7jI7F7QpPL4BCWRYsN11akPJOjySqJ"
  
# authorization of consumer key and consumer secret
# auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
  
# set access to user's access key and access secret 
# auth.set_access_token(access_token, access_token_secret)
  
# calling the api 
# api = tweepy.API(auth)
  

In [50]:
#api.get_user(screen_name="Zeldathons")
#api.get_user(screen_name="Zeldathons").listed_count

In [51]:
# the screen_name of the targeted user
# user_=df_value_counts["username"]
# name=[]
# followers=[]
# listed_count=[]
# user_

Please don't run the following scraping code, it takes a long time

In [52]:
#import time
#import datetime
#for batch in range(10,21):
#    if batch <20:
#        start = time.time()
#        for i in user_[batch*200:batch*200+199]:
#            try:
#                name.append(api.get_user(screen_name=i).screen_name)
#                followers.append(api.get_user(screen_name=i).followers_count)
#                listed_count.append(api.get_user(screen_name=i).listed_count)
#            except:
#                print("The username",i,"no longer exists or got suspended")
#        print("batch",batch,"complete")
#        end = time.time()
#        print("Time used for batch",batch,":",(end - start)/60,"min")
#        ct = datetime.datetime.now()
#        print("current timestamp:", ct)
#        time.sleep(60*15) #wait for 15min refresh,1min used for buffer
#    else:
#        for i in user_[4000:]:
#            try:
#                name.append(api.get_user(screen_name=i).screen_name)
#                followers.append(api.get_user(screen_name=i).followers_count)
#                listed_count.append(api.get_user(screen_name=i).listed_count)
#            except:
#                print("The username",i,"no longer exists or got suspended")

In [53]:
# followers

In [54]:
# listed_count

In [55]:
# name

In [56]:
# score = pd.DataFrame(columns=['Username', 'followers', 'listed_count'])

In [57]:
# degree=[]
# mentioned=[]
#for i in name:
 #   degree.append(degree_centrality.loc[degree_centrality['index'] == i].degree.values.tolist()[0])
  #  mentioned.append(df_value_counts.loc[df_value_counts['username'] == i].mentioned_counts.values.tolist()[0])

In [58]:
# degree

In [59]:
# mentioned

In [60]:
# score["Username"]=name
# score["followers"]=followers
# score["listed_count"]=listed_count
# score["degree"]=degree
# score["mentioned"]=mentioned

In [61]:
# score.to_csv("score_prep.csv",index=False)

In [62]:
# Using another twitter account for scraping

# assign the values accordingly
# consumer_key = "hrM7rHMyKmwQDHdE3ky79c1RG"
# consumer_secret = "CvNNN40H2tuSekmw47xoQaVI3h8hAfYdcB8s2qw0MHNx7dKwIo"
# access_token = "1507092783970099200-0y6VEFFzeu1rOR9kYpjsb3d8GXsLCH"
# access_token_secret = "q7Lx6o1OiI7O53B0fx00S6jROqUB6odzMyyVK8J4XM2q4"
  
# authorization of consumer key and consumer secret
# auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
  
# set access to user's access key and access secret 
# auth.set_access_token(access_token, access_token_secret)
  
# calling the api 
# api = tweepy.API(auth)
  

Please don't run the following scraping code, it takes a long time

In [63]:
#import time
#import datetime
#for batch in range(10):
#        start=time.time()
#        for i in user_[batch*200:batch*200+199]:
#            try:
#                followers.append(api.get_user(screen_name=i).followers_count)
#                listed_count.append(api.get_user(screen_name=i).listed_count)
#                name.append(api.get_user(screen_name=i).screen_name)
#            except:
#                print("The username",i,"no longer exists or got suspended")
#        print("batch",batch,"complete")
#        end=time.time()
#        print("Time used for batch", batch,":",(end-start)/60,"min")
#        ct=datetime.datetime.now()
#        print("current timestamp:", ct)
#        time.sleep(60*15) #wait for 15min refresh,1min used for buffer

In [64]:
# score_ = pd.DataFrame(columns=['Username', 'followers', 'listed_count'])
# degree=[]
# mentioned=[]
#for i in name:
 #   degree.append(degree_centrality.loc[degree_centrality['index'] == i].degree.values.tolist())
  #  mentioned.append(df_value_counts.loc[df_value_counts['username'] == i].mentioned_counts.values.tolist())
# score_["Username"]=name
# score_["followers"]=followers
# score_["listed_count"]=listed_count
# score_["degree"]=degree
# score_["mentioned"]=mentioned

In [65]:
#for i in range(len(score_)):
 #   if len(score_.degree[i])<1:
  #      score_.degree[i] = [0]
#score_.loc[score_.Username == 'LAVEESHA_']

In [66]:
#for i in range(len(score_)):
 #   score_['degree'][i] = score_['degree'][i][0]
  #  score_['mentioned'][i] = score_['mentioned'][i][0]
#score_

In [67]:
#score_.to_csv("/Users/corrine/Desktop/score_prep4.csv",index=False)

In [68]:
# score1=pd.read_csv("score_prep4.csv")

In [69]:
# score1

In [70]:
# frames = [score1,score]
# score_final = pd.concat(frames)
#score_final = score_final.reset_index()
#score_final = score_final.drop(columns="index")
#score_final

In [71]:
#score_final.to_csv("score_prep_final.csv",index=False)

In [72]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler,Normalizer

# read already scraped data directly from csv we provided
score_final=pd.read_csv("score_prep_final.csv")
# we want to filter out users who are mentioned less than 3 times
score_final = score_final.drop(score_final[score_final.mentioned < 3].index)

In [73]:
# standardization
X_score = score_final.iloc[:,1:]
scaler = StandardScaler()
X_norm = scaler.fit_transform(X_score)

In [74]:
# result of standardization
df_final = pd.DataFrame(X_norm,columns=["followers","listed_count","degree","mentioned"])
df_final["Username"]=score_final["Username"]
df_final

Unnamed: 0,followers,listed_count,degree,mentioned,Username
0,-0.115482,-0.133794,13.407544,9.748261,Zeldathons
1,-0.114875,-0.132522,3.386342,3.950143,Zelda_king13
2,-0.115426,-0.133031,-0.007935,2.853201,tweetNorvena
3,-0.115399,-0.133158,-0.007935,2.853201,DonnaVita4
4,-0.115449,-0.133286,-0.007935,2.853201,Bonu19RC
...,...,...,...,...,...
253,-0.108746,-0.104669,-0.331200,-0.437622,DullVivid
254,-0.115267,-0.131887,-0.277323,-0.437622,writersflood
255,-0.111326,-0.117642,-0.331200,-0.437622,sme_rt
256,-0.115483,-0.133794,-0.385077,-0.437622,ghosty_geist


In [75]:
# Assign weight to each feature

#A_B_listed_count 0.182173
#A_B_network_feature_1 0.123855
#A_B_mentions_received 0.114590
#A_B_follower_count 0.111455
#A_B_retweets_received 0.087582
ratio = 1.0/(0.182173+0.123855+0.114590+0.111455)
adjust = 0.05
w_list = 0.182173*ratio+adjust
w_feat = 0.123855*ratio
w_follow = 0.111455*ratio
w_mention = 0.114590*ratio-adjust

# calculate popularity score for each user
rate=[]
for i in range(len(score_final)):
    rate.append(w_list*df_final["listed_count"][i]
                +w_feat*df_final["degree"][i]
                +w_follow*df_final["followers"][i]
                +w_mention*df_final["mentioned"][i])
rate

[4.656317973132313,
 1.365420068573305,
 0.39359505464566086,
 0.39355081139804815,
 0.3934904547824353,
 0.3934438020592338,
 0.393437296475144,
 0.39348726222728014,
 0.5626900931857998,
 0.612207101152242,
 1.4404644363341208,
 0.26527495089855124,
 0.2651224935743298,
 0.26521956382634027,
 0.2651665259928284,
 0.2652229672106095,
 0.4591110401570583,
 0.43296946695237437,
 0.35904839230169144,
 0.5677966044390448,
 0.2424134305636481,
 0.5544786570362805,
 0.4906036911417966,
 0.284590432239603,
 0.3525589675625226,
 0.3509380401799923,
 0.4283563970081145,
 0.35796638806428654,
 0.4906581318228249,
 0.269396967760069,
 0.11747097670551748,
 0.2241285996992324,
 0.15425485396961638,
 -0.04390384842091915,
 -0.04561680954418646,
 -0.043675613089429556,
 -0.041136173675396834,
 -0.04096483208624807,
 -0.03377126820511681,
 -0.0411162360127863,
 0.03451066279800741,
 -0.0404113536518087,
 -0.04064079222531895,
 -0.039958887994109044,
 -0.04100675696149339,
 0.009345034476940371,
 1.1

In [76]:
# get top 100 influencers
score_final["score"]=rate
score_final=score_final.sort_values(by="score",ascending=False).reset_index()
popularity = score_final.head(100)
popularity=popularity.drop(columns="index")
popularity

Unnamed: 0,Username,followers,listed_count,degree,mentioned,score
0,elonmusk,78986831,88007,0.000897,6,6.589361
1,YouTube,74700911,79895,0.002333,13,6.201123
2,Zeldathons,6,0,0.046303,263,4.656318
3,PlayStation,24489619,34212,0.000538,3,2.218804
4,nerdist,481964,4772,0.013819,78,1.440464
...,...,...,...,...,...,...
95,PenOfSmiting,2590,31,0.001615,4,-0.142800
96,RanguGamer,90331,77,0.001256,7,-0.143510
97,Scotland1509,244,4,0.001615,4,-0.144219
98,GenesaurusRex,452,3,0.001615,4,-0.144262


In [78]:
popularity

Unnamed: 0,Username,followers,listed_count,degree,mentioned,score
0,elonmusk,78986831,88007,0.000897,6,6.589361
1,YouTube,74700911,79895,0.002333,13,6.201123
2,Zeldathons,6,0,0.046303,263,4.656318
3,PlayStation,24489619,34212,0.000538,3,2.218804
4,nerdist,481964,4772,0.013819,78,1.440464
...,...,...,...,...,...,...
95,PenOfSmiting,2590,31,0.001615,4,-0.142800
96,RanguGamer,90331,77,0.001256,7,-0.143510
97,Scotland1509,244,4,0.001615,4,-0.144219
98,GenesaurusRex,452,3,0.001615,4,-0.144262


In [79]:
# export to csv
popularity.to_csv("popularity.csv",index=False)

# 8. Visualization

In [80]:
# read data
popularity=pd.read_csv('popularity.csv')
interactions=pd.read_csv('users_interactions.csv')

In [81]:
# list of top 100 influencers
top=popularity['Username']

In [82]:
# create dataframe for visualization
viz=interactions[interactions['user_mentioned'].isin(top)]
viz=viz.reset_index(drop=True)
viz['Type']=np.where(viz['Type']=='Non-tweet',0,1)
viz

Unnamed: 0,user_posting_tweet,user_mentioned,Type
0,Bearman851,Zeldathons,0
1,jaymisaeki,Zeldathons,0
2,DeMistyB,Zeldathons,0
3,USMC1683,HitCockBottom,0
4,minimarker3,PowerPlayRPG,0
...,...,...,...
2983,YvoDreamcatcher,Anna55714043,0
2984,tjamara_,Zelda_king13,0
2985,tjamara_,drecksuser,0
2986,drecksuser,Zelda_king13,0


In [83]:
# export dataframe
viz.to_csv("visualization.csv",index=False)