# INSY 670 Group Project Part 2
Alice Liu, Diwei Zhu, Yingxin Jiang, Kexin Wang, Yichen Wang

# Part 1

## 1.  Predictive Model

### 1.1 Environment Setup

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler,Normalizer
#!pip install decorator==5.0.9 

### 1.2 Data Pre-processing

In [None]:
# Load the labelled training data from the working directory and print its
# column/dtype summary.
train = pd.read_csv('train.csv')
train.info()

In [None]:
# Empty frame whose 11 columns will hold the per-feature A-minus-B differences.
diff_features = [
    "follower_count", "following_count", "listed_count", "mentions_received",
    "retweets_received", "mentions_sent", "retweets_sent", "posts",
    "network_feature_1", "network_feature_2", "network_feature_3",
]
df = pd.DataFrame(columns=["A_B_" + feature for feature in diff_features])

In [None]:
# Build the 11 A-minus-B difference features in one vectorised step.
# Like the loop this replaces, it assumes columns 1-11 of `train` hold user A's
# features and columns 12-22 hold the matching features for user B.
# Writing `df.iloc[:, i] = series` into the zero-row frame created above depends
# on the pandas version: newer releases set values in place, which leaves the
# frame empty and the features NaN once 'Choice' is attached.
feature_names = list(df.columns[:11])
a_values = train.iloc[:, 1:12].to_numpy()
b_values = train.iloc[:, 12:23].to_numpy()
df = pd.DataFrame(a_values - b_values, columns=feature_names, index=train.index)

# Target label, aligned on the same row index as the features.
df['Choice'] = train['Choice']

### 1.3 Features Selection

In [None]:
# Use all 11 difference features as predictors (an earlier draft sliced only the
# first 10, which was wrong); 'Choice' is the target.
X = df.loc[:, df.columns[:11]]
y = df['Choice']

# Standardise every feature to zero mean and unit variance.
scaler = StandardScaler().fit(X)
X_std = scaler.transform(X)

In [None]:
# Rank the standardised features with a random forest fitted on the full data set
# (fixed seed so the ranking is reproducible), then print the five most important.
randomforest = RandomForestClassifier(random_state=42)
model = randomforest.fit(X_std, y)
importance = pd.DataFrame({
    'predictor': X.columns,
    'feature importance': model.feature_importances_,
})
top_five = importance.sort_values(by=['feature importance'], ascending=False).head(5)
print(top_five)

In [None]:
# Subset holding only the five top-ranked features.
# NOTE(review): the classifiers below are trained on X_std (all 11 features), and
# df2 is not referenced again in the visible notebook - confirm whether it is needed.
df2 = df.loc[:, [
    'A_B_listed_count',
    'A_B_network_feature_1',
    'A_B_mentions_received',
    'A_B_follower_count',
    'A_B_retweets_received',
]]

### 1.4 Binary Classification

#### 1.4.1 Train Test Split

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): X_std was standardised on the full data set before this split, so the
# scaler has already seen the test rows.
X_train, X_test, y_train, y_test = train_test_split(X_std,y,test_size=0.33,random_state=5)

#### 1.4.2 Logistic Regression

In [None]:
# Run the Model
# Fit on the training split only. Fitting on the full data set (X_std, y) lets the
# model see the test rows and inflates the accuracy reported below, which also makes
# the comparison with the other two classifiers unfair.
lr = LogisticRegression()
model_lr = lr.fit(X_train,y_train)

# Calculate the accuracy score on the held-out rows
y_test_pred = model_lr.predict(X_test)
print('Logistic Regression Accuracy Score:',accuracy_score(y_test,y_test_pred))

# Confusion Matrix (rows: true label, columns: predicted label)
print('Logistic Regression Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test,y_test_pred,labels=[0,1]),index=['true:0','true:1'],columns=['pred:0','pred:1']))

#### 1.4.3 Random Forest

In [None]:
# Fit a random forest on the training split (fixed seed for reproducibility).
randomforest = RandomForestClassifier(random_state=42)
model_rf = randomforest.fit(X_train, y_train)

# Accuracy on the held-out rows
y_test_pred = model_rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_test_pred)
print('Random Forest Accuracy Score:', rf_accuracy)

# Confusion matrix with labelled axes (rows: true label, columns: predicted label)
rf_confusion = pd.DataFrame(
    confusion_matrix(y_test, y_test_pred, labels=[0, 1]),
    index=['true:0', 'true:1'],
    columns=['pred:0', 'pred:1'],
)
print('Random Forest Confusion Matrix:')
print(rf_confusion)

#### 1.4.4 Gradient Boosting

In [None]:
# Fit gradient-boosted trees on the training split (fixed seed for reproducibility).
gbt = GradientBoostingClassifier(random_state=42)
model_gbt = gbt.fit(X_train, y_train)

# Accuracy on the held-out rows; `y_test_pred` is reused by the financial section.
y_test_pred = model_gbt.predict(X_test)
gbt_accuracy = accuracy_score(y_test, y_test_pred)
print('Gradient Boosting Accuracy Score:', gbt_accuracy)

# Confusion matrix with labelled axes (rows: true label, columns: predicted label)
gbt_confusion = pd.DataFrame(
    confusion_matrix(y_test, y_test_pred, labels=[0, 1]),
    index=['true:0', 'true:1'],
    columns=['pred:0', 'pred:1'],
)
print('Gradient Boosting Confusion Matrix:')
print(gbt_confusion)

According to accuracy score, we see that gradient boosting has the best performance.

### 1.5 Results Analysis

**1. Which factors are best predictors of influence?**<br>
According to the Random Forest feature importances, the top 5 predictors are the differences between users A and B in listed count, network feature 1, mentions received, follower count, and retweets received ('A_B_listed_count', 'A_B_network_feature_1', 'A_B_mentions_received', 'A_B_follower_count', and 'A_B_retweets_received').<br><br>

**2. Are there any surprises here?** <br>
We are not surprised by the result: the more influential a person is on social media, the more interactions they receive (mentions_received, retweets_received) and the more popularity they gain (listed_count, follower_count).<br><br>

**3. How can a business use your model/results?** <br>
With our model, companies can identify the most interactive and popular influencers among all Twitter users to promote their products.

## 2. Financial value calculation

Influencers tweet twice: <br>
profit margin: PM_tweet twice = -\\$10 + \\$10\*0.03%\*followers <br>

Influencers tweet once: <br>
profit margin: PM_tweet once = -\\$5 + \\$10\*0.02%\*followers <br>

Normal users tweet once: <br> 
profit margin: PM_non influencer = -\\$5

### 2.1 expected net profit without model

Make every user tweet once

In [None]:
# Mean follower count of the A-side and B-side users in the training data; the
# profit calculations below use these as the expected audience per influencer.
A_follower_avg = train["A_follower_count"].mean()
B_follower_avg = train["B_follower_count"].mean()

In [None]:
# Baseline without a model: all 8 users are paid $5 to tweet once.
cost1 = 8*(-5)

# Influencers (John, Sue, Sandy & Moe): $10 margin at a 0.02% rate on followers,
# two users priced at the A-side average and two at the B-side average.
income_from_a1 = 2*A_follower_avg*0.0002*10
income_from_b1 = 2*B_follower_avg*0.0002*10
income_influencer1 = income_from_a1 + income_from_b1

# Non-influencers (Ted, Ron, Fred & Alex) generate no income.
income_non_influencer1 = 0

expected_net_profit_without_model = cost1 + income_influencer1 + income_non_influencer1

print("net profit without the model: $", expected_net_profit_without_model, sep="")

### 2.2 expected net profit with the perfect model

Precisely identify all influencers

In [None]:
# Perfect model: only the 4 true influencers are hired, each paid $10 to tweet twice.
cost2 = 4*(-10)

# Influencers (John, Sue, Sandy & Moe): $10 margin at a 0.03% rate on followers,
# two users priced at the A-side average and two at the B-side average.
income_from_a2 = 2*A_follower_avg*0.0003*10
income_from_b2 = 2*B_follower_avg*0.0003*10
income_influencer2 = income_from_a2 + income_from_b2

# Non-influencers (Ted, Ron, Fred & Alex) are not hired and generate no income.
income_non_influencer2 = 0

expected_net_profit_with_perfect_model = cost2 + income_influencer2 + income_non_influencer2

print("net profit with the perfect model: $", expected_net_profit_with_perfect_model, sep="")

### 2.3 expected net profit with our model (accuracy = 0.7769)

~77% of the influencers would be successfully identified by our model.

In [None]:
# Hold-out accuracy of the gradient boosting model (the best performer above).
# Predict explicitly with `model_gbt` instead of reusing `y_test_pred`, which holds
# the predictions of whichever classifier cell was executed last.
accuracy = accuracy_score(y_test, model_gbt.predict(X_test))

In [None]:
# Show the accuracy value used to scale the profit figures below.
accuracy

In [None]:
# Our model: the perfect-model cost and income, each scaled by the model accuracy.
cost3 = 4*(-10)*accuracy

income_from_a3 = 2*A_follower_avg*0.0003*10*accuracy
income_from_b3 = 2*B_follower_avg*0.0003*10*accuracy
income_influencer3 = income_from_a3 + income_from_b3
income_non_influencer3 = 0

expected_net_profit_with_our_model = cost3 + income_influencer3 + income_non_influencer3

print("net profit with our model: $", expected_net_profit_with_our_model, sep="")

### 2.4 Financial value results

In [None]:
# Gain of each scenario over the no-model baseline.
boost_our_model = expected_net_profit_with_our_model - expected_net_profit_without_model
boost_perfect_model = expected_net_profit_with_perfect_model - expected_net_profit_without_model
print("boost in expected net profit from using our model = $", boost_our_model, sep="")
print("boost in expected net profit from using perfect model = $", boost_perfect_model, sep="")

# Part 2

## 1. Environment Setup

In [None]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import re

## 2. Twitter Data Scraping

In [None]:
# Keyword used to search tweets
key_word = "Zelda"
# User name used to search tweets (empty: no user search is configured)
user_name = ""
# Start and end dates of the search window
from_date = "2020-01-01"
end_date = "2021-03-19"
# Maximum number of tweets to collect
count = 5000
# Accumulators for the keyword-search and user-search results
tweets_list_keyword, tweets_list_user = [], []

In [None]:
# Query string handed to the scraper: "<keyword> since:<start> until:<end>".
command_keyword = key_word+' since:'+from_date+' until:'+end_date

print("Scraping data for keyword:", key_word)

# Collect [author, mentioned users, retweeted tweet] for at most `count` tweets.
# The limit is checked BEFORE appending so that exactly `count` rows are stored;
# testing `i > count` after the append kept count + 2 tweets.
for i, tweet in enumerate(sntwitter.TwitterSearchScraper(command_keyword).get_items()):
    if i >= count:
        break
    tweets_list_keyword.append([tweet.user.username, tweet.mentionedUsers, tweet.retweetedTweet])

# Create a dataframe from the tweets list above and export it to a csv file
tweets_df_keyword = pd.DataFrame(tweets_list_keyword, columns=['username','mentionedusers', 'retweet'])
tweets_df_keyword.to_csv("tweets_keywords.csv",index=False)
print("Scraped data have been exported to the csv file")

## 3. Data  Preprocessing

In [None]:
# Reload the scraped tweets and keep only the author and the raw mention string.
# Rows without mentions (NaN) are kept on purpose; the next cells give them a
# placeholder. `reset_index()` adds the old row index to `tweets` as an `index`
# column, and `df` is the two-column frame the following cells work on.
wanted_columns = ['username', 'mentionedusers']
tweets = pd.read_csv('tweets_keywords.csv')[wanted_columns].reset_index()
df = tweets[wanted_columns]

In [None]:
# Preview the author / raw-mention frame.
df

In [None]:
import numpy as np

# Tweets without any mention are read back as NaN. Mark them with a sentinel once;
# the original repeated this whole-column replacement for every row.
NO_MENTION = "username="
df["mentionedusers"] = df["mentionedusers"].replace(np.nan, NO_MENTION)

# Build [author, mention string] pairs. A tweet without mentions gets its own author
# as the only "mentioned" user, written in the same "username='<name>'," token format
# that the cleaning cells below parse.
temp = []
for author, mentions in zip(df["username"], df["mentionedusers"]):
    if mentions == NO_MENTION:
        mentions = "username='" + author + "',"
    temp.append([author, mentions])

In [None]:
# Rebuild `tweets` from the [author, mention string] pairs prepared above and preview it.
tweets = pd.DataFrame(temp,columns=["username","mentionedusers"])
tweets

In [None]:
# calculate the number of mentioned users in a tweet
# (every mentioned user contributes one "username=" token to the raw string).
# The counts are written straight into the column: the former intermediate list was
# named `count` and overwrote the scraping limit `count` defined in the configuration
# cell, which broke any later re-run of the scraping loop.
tweets['#mentionedusers'] = tweets['mentionedusers'].str.count('username=')

In [None]:
# Preview `tweets` with the new '#mentionedusers' column.
tweets

In [None]:
# Distribution of tweets by number of mentioned users (how many tweets have 1, 2, ... mentions)
tweets['#mentionedusers'].value_counts()

In [None]:
# cleaning mentionedusers column
def _mentioned_usernames(raw):
    """Return the bare user names contained in one raw mention string.

    Each token matched by the regex looks like "username='<name>',"; the slice
    drops the 10-character "username='" prefix and the trailing "'," suffix.
    """
    return [token[10:-2] for token in re.findall(r'username=\S+', raw)]


# Replace the whole column in one assignment. The previous element-wise chained
# assignment (tweets['mentionedusers'][i] = ...) triggers SettingWithCopyWarning and
# does not modify the frame at all once pandas Copy-on-Write is enabled.
tweets['mentionedusers'] = tweets['mentionedusers'].apply(_mentioned_usernames)

## 4. Interaction records

In [None]:
# Parallel accumulators: entry k of each list describes one (poster -> mentioned) pair.
user_mentioned, user_posting_tweet = [], []

In [None]:
# Flatten every tweet into one (poster, mentioned user) pair per mention.
for author, mentions in zip(tweets['username'], tweets['mentionedusers']):
    user_mentioned.extend(mentions)
    user_posting_tweet.extend([author] * len(mentions))

In [None]:
# Edge list of the mention network: one row per (poster, mentioned user) pair.
interactions = pd.DataFrame({
    'user_posting_tweet': user_posting_tweet,
    'user_mentioned': user_mentioned,
})

In [None]:
# A row whose poster equals the mentioned user is a plain tweet (the self-reference
# placeholder added for tweets without mentions); any other row is a real mention.
Type = [
    "Tweet" if poster == mentioned_user else "Non-tweet"
    for poster, mentioned_user in zip(user_posting_tweet, user_mentioned)
]
interactions["Type"]=Type

In [None]:
# Persist the edge list; the centrality and visualization sections reload this file.
interactions.to_csv("users_interactions.csv",index=False) # Export to a csv file
interactions

## 5. Mentioned Users Counts

In [None]:
# Preview `tweets`; 'mentionedusers' now holds lists of bare user names.
tweets

In [None]:
# Flatten all mention lists into one list of mentioned user names.
# NOTE(review): tweets without mentions carry their own author as a placeholder, so
# these counts also include a user's own plain tweets, not only mentions by others.
most = [user for mentions in tweets['mentionedusers'] for user in mentions]

In [None]:
# Rank users by how often they appear in the flattened mention list.
most = pd.Series(most)
value_counts = most.value_counts()

# Turn the ranking into a two-column frame and export it to a csv file.
df_value_counts = (
    value_counts
    .rename_axis('username')
    .reset_index(name='mentioned_counts')
)
df_value_counts.to_csv("mentioned_users_counts.csv",index=False)
df_value_counts

## 6. Degree, Betweenness, Closeness

In [None]:
# Reload the (poster, mentioned user, Type) edge list exported in section 4.
data = pd.read_csv("users_interactions.csv")

In [None]:
import networkx as nx
# Empty directed graph; edges (poster -> mentioned user) are added below.
G = nx.DiGraph()

In [None]:
# Preview the reloaded edge list.
data

In [None]:
# One directed edge per interaction: poster -> mentioned user.
lst = list(zip(data["user_posting_tweet"], data["user_mentioned"]))
G.add_edges_from(lst)

In [None]:
# Show the (poster, mentioned user) edge tuples added to G.
lst

In [None]:
# Closeness centrality of every node as a two-column frame: 'index' (node) and 'closeness'.
closeness_centrality = (
    pd.Series(nx.closeness_centrality(G), name='closeness')
    .rename_axis('index')
    .reset_index()
)
closeness_centrality

In [None]:
# Degree centrality of every node as a two-column frame: 'index' (node) and 'degree'.
# This frame is looked up again when the popularity score is assembled.
degree_centrality = (
    pd.Series(nx.degree_centrality(G), name='degree')
    .rename_axis('index')
    .reset_index()
)
degree_centrality

In [None]:
# If this call fails because of the installed `decorator` version, the authors'
# workaround was: pip install decorator==5.0.9
# Betweenness centrality of every node as a two-column frame: 'index' (node) and 'betweenness'.
betweenness_centrality = (
    pd.Series(nx.betweenness_centrality(G), name='betweenness')
    .rename_axis('index')
    .reset_index()
)
betweenness_centrality

In [None]:
# Keep only the nodes with non-zero betweenness.
betweenness_centrality[betweenness_centrality['betweenness']!=0]

## 7. top 100

Predictor  feature importance (weight)<br>
 network feature 1 = degree centrality<br><br>
       A_B_listed_count            0.182173<br>
  A_B_network_feature_1            0.123855<br> 
  A_B_mentions_received            0.114590<br>
    A_B_follower_count            0.111455<br>
 A_B_retweets_received            0.087582<br>


In [None]:
# Second directed graph; it receives the same edges as G with the direction reversed.
G2 = nx.DiGraph()

In [None]:
# Reversed edges: mentioned user -> poster.
lst2 = list(zip(data["user_mentioned"], data["user_posting_tweet"]))
G2.add_edges_from(lst2)

In [None]:
import matplotlib.pyplot as plt
#nx.draw(G2)
#plt.show()

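# Diwei: user info scraping
### P.S. The `retweets received` feature used by the model cannot be scraped with our API access; we suggest modifying the model to skip it.
### Yichen: You can try on your own computers whether scraping works with my key.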

In [None]:
# Get user information for the predictive model for all users who appeared in our 5000 scraped tweets
#Score = w1 × retweets + w2 × listed_count + w3 × #followers + w4 × network_feature_1, where w1+w2+w3+ w4 = 1.
#!pip install tweepy

In [None]:
import pandas as pd
# Reload the per-user mention counts exported in section 5 so that this part can
# be run without repeating the scraping and counting cells.
df_value_counts = pd.read_csv("mentioned_users_counts.csv")
df_value_counts

In [None]:
# import the modules
import os

import tweepy

# Read the Twitter API credentials from environment variables instead of hardcoding
# them: secrets stored in a notebook leak to everyone who can read the file or its
# history. Any key that was ever committed here should be revoked and regenerated.
# A missing variable raises KeyError, naming the credential that is not configured.
consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
access_token = os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]

# authorization of consumer key and consumer secret
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)

# set access to user's access key and access secret
auth.set_access_token(access_token, access_token_secret)

# calling the api
api = tweepy.API(auth)
  


In [None]:
#api.get_user(screen_name="Zeldathons")
#api.get_user(screen_name="Zeldathons").listed_count

In [None]:
# the screen_name of the targeted user
user_=df_value_counts["username"]
# Parallel result lists filled by the lookup loop below:
# entry k of each list belongs to the same account.
name=[]
followers=[]
listed_count=[]
user_

In [None]:
import time
import datetime

BATCH_SIZE = 200  # users looked up between two rate-limit pauses


def _append_user_stats(screen_name):
    """Look up one account and append its stats to the three result lists.

    The profile is requested once and all three values are appended together, so
    `name`, `followers` and `listed_count` always stay aligned. The original made
    three separate requests per user, and a failure between them could leave the
    lists with different lengths.
    """
    try:
        user = api.get_user(screen_name=screen_name)
    except Exception as exc:  # unlike a bare `except`, Ctrl-C still interrupts
        print("The username",screen_name,"no longer exists or got suspended")
        # Show the real cause too: rate-limit or network errors also end up here.
        print("  lookup error:", exc)
        return
    name.append(user.screen_name)
    followers.append(user.followers_count)
    listed_count.append(user.listed_count)


# Batches 10-19 cover users 2000-3999.
# NOTE(review): the start at batch 10 is hardcoded; presumably batches 0-9 were
# fetched in an earlier run - confirm.
for batch in range(10, 20):
    start = time.time()
    # Full 200-user slice: the previous end index `batch*200+199` silently
    # skipped the last user of every batch.
    for i in user_[batch*BATCH_SIZE:(batch+1)*BATCH_SIZE]:
        _append_user_stats(i)
    print("batch",batch,"complete")
    end = time.time()
    print("Time used for batch",batch,":",(end - start)/60,"min")
    ct = datetime.datetime.now()
    print("current timestamp:", ct)
    time.sleep(60*15)  # wait 15 minutes for the API rate limit to refresh

# Remaining users after the last full batch.
for i in user_[4000:]:
    _append_user_stats(i)

In [None]:
# Inspect the follower counts collected by the lookup loop.
followers

In [None]:
# Inspect the listed counts collected by the lookup loop.
listed_count

In [None]:
# Inspect the screen names that were resolved successfully.
name

In [None]:
# Empty frame that the following cells fill with one row per resolved account.
score = pd.DataFrame(columns=['Username', 'followers', 'listed_count'])

In [None]:
# Degree centrality and mention count for every resolved account, in `name` order.
# Both tables are turned into dicts once, so each lookup is constant-time instead of
# a scan of the whole DataFrame per user. A screen name missing from either table
# raises a KeyError naming that user.
# NOTE(review): this assumes node names and usernames are unique in their tables,
# which holds for graph nodes and for value_counts() output.
degree_by_user = dict(zip(degree_centrality['index'], degree_centrality['degree']))
mentions_by_user = dict(zip(df_value_counts['username'], df_value_counts['mentioned_counts']))

degree = [degree_by_user[i] for i in name]
mentioned = [mentions_by_user[i] for i in name]

In [None]:
# Inspect the degree centralities matched to the resolved accounts.
degree

In [None]:
# Inspect the mention counts matched to the resolved accounts.
mentioned

In [None]:
# Fill the score frame column by column; all five lists are in the same account order.
score_columns = {
    "Username": name,
    "followers": followers,
    "listed_count": listed_count,
    "degree": degree,
    "mentioned": mentioned,
}
for column, values in score_columns.items():
    score[column] = values

In [None]:
# Save the accounts resolved in this run.
score.to_csv("score_prep.csv",index=False)

In [None]:
# NOTE(review): "score_prep4.csv" is not written anywhere in the visible notebook;
# presumably it holds the accounts resolved in an earlier run - confirm the file's origin.
score1=pd.read_csv("score_prep4.csv")

In [None]:
# Preview the previously saved part of the score table.
score1

In [None]:
# Stack the previously saved accounts and the ones resolved in this run,
# renumbering the rows from 0.
frames = [score1, score]
score_final = pd.concat(frames, ignore_index=True)
score_final

In [None]:
# Save the combined table; the scoring cells below start from this file.
score_final.to_csv("score_prep_final.csv",index=False)

In [30]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler,Normalizer

# Reload the combined table and discard accounts mentioned fewer than 3 times.
score_final = pd.read_csv("score_prep_final.csv")
rarely_mentioned = score_final.mentioned < 3
score_final = score_final.loc[~rarely_mentioned]

In [31]:
# Every column except the first (Username) is a numeric feature.
X_score = score_final.iloc[:, 1:]

# Standardise each feature to zero mean and unit variance.
scaler = StandardScaler().fit(X_score)
X_norm = scaler.transform(X_score)

In [32]:
# Wrap the standardised matrix in a frame (fresh 0..n-1 row index).
df_final = pd.DataFrame(X_norm,columns=["followers","listed_count","degree","mentioned"])

# Attach the usernames by POSITION. `score_final` keeps its original row labels after
# the `mentioned < 3` filter, so a plain Series assignment would align on those labels
# and produce shifted or NaN usernames whenever a dropped row is not at the end.
df_final["Username"]=score_final["Username"].to_numpy()
df_final

Unnamed: 0,followers,listed_count,degree,mentioned,Username
0,-0.115482,-0.133794,13.407544,9.748261,Zeldathons
1,-0.114875,-0.132522,3.386342,3.950143,Zelda_king13
2,-0.115426,-0.133031,-0.007935,2.853201,tweetNorvena
3,-0.115399,-0.133158,-0.007935,2.853201,DonnaVita4
4,-0.115449,-0.133286,-0.007935,2.853201,Bonu19RC
...,...,...,...,...,...
253,-0.108746,-0.104669,-0.331200,-0.437622,DullVivid
254,-0.115267,-0.131887,-0.277323,-0.437622,writersflood
255,-0.111326,-0.117642,-0.331200,-0.437622,sme_rt
256,-0.115483,-0.133794,-0.385077,-0.437622,ghosty_geist


In [33]:
# Random-forest feature importances from Part 1 that have a counterpart in the
# scraped data (A_B_retweets_received, 0.087582, is left out):
IMP_LISTED_COUNT = 0.182173       # A_B_listed_count
IMP_NETWORK_FEATURE_1 = 0.123855  # A_B_network_feature_1 (degree centrality here)
IMP_MENTIONS_RECEIVED = 0.114590  # A_B_mentions_received
IMP_FOLLOWER_COUNT = 0.111455     # A_B_follower_count

# Rescale the four importances so that the weights sum to 1 ...
ratio = 1.0/(IMP_LISTED_COUNT+IMP_NETWORK_FEATURE_1+IMP_MENTIONS_RECEIVED+IMP_FOLLOWER_COUNT)
# ... then move 0.05 of weight from mentions to listed count.
adjust = 0.05
w_list = IMP_LISTED_COUNT*ratio+adjust
w_feat = IMP_NETWORK_FEATURE_1*ratio
w_follow = IMP_FOLLOWER_COUNT*ratio
w_mention = IMP_MENTIONS_RECEIVED*ratio-adjust
# Popularity score: weighted sum of the standardised features, computed for all
# accounts at once instead of a Python loop that indexed every cell by row label.
# The terms are added in the same order as before, so the values are unchanged.
rate = (w_list*df_final["listed_count"]
        +w_feat*df_final["degree"]
        +w_follow*df_final["followers"]
        +w_mention*df_final["mentioned"]).tolist()
rate

[4.656317973132313,
 1.365420068573305,
 0.39359505464566086,
 0.39355081139804815,
 0.3934904547824353,
 0.3934438020592338,
 0.393437296475144,
 0.39348726222728014,
 0.5626900931857998,
 0.612207101152242,
 1.4404644363341208,
 0.26527495089855124,
 0.2651224935743298,
 0.26521956382634027,
 0.2651665259928284,
 0.2652229672106095,
 0.4591110401570583,
 0.43296946695237437,
 0.35904839230169144,
 0.5677966044390448,
 0.2424134305636481,
 0.5544786570362805,
 0.4906036911417966,
 0.284590432239603,
 0.3525589675625226,
 0.3509380401799923,
 0.4283563970081145,
 0.35796638806428654,
 0.4906581318228249,
 0.269396967760069,
 0.11747097670551748,
 0.2241285996992324,
 0.15425485396961638,
 -0.04390384842091915,
 -0.04561680954418646,
 -0.043675613089429556,
 -0.041136173675396834,
 -0.04096483208624807,
 -0.03377126820511681,
 -0.0411162360127863,
 0.03451066279800741,
 -0.0404113536518087,
 -0.04064079222531895,
 -0.039958887994109044,
 -0.04100675696149339,
 0.009345034476940371,
 1.1

In [34]:
# Show the two weights that were changed by the manual `adjust` shift.
print(w_mention)
print(w_list)

0.16536518485245444
0.39238346993739576


In [35]:
# Attach the score, rank the accounts from highest to lowest and keep the top 100.
# `reset_index()` keeps the old row labels in an `index` column, dropped in the next cell.
score_final = (
    score_final
    .assign(score=rate)
    .sort_values(by="score", ascending=False)
    .reset_index()
)
popularity = score_final.iloc[:100]

In [36]:
# Remove the leftover `index` column created by reset_index() in the previous cell.
popularity=popularity.drop(columns="index")

In [37]:
# Preview the 100 highest-scoring accounts.
popularity

Unnamed: 0,Username,followers,listed_count,degree,mentioned,score
0,elonmusk,78986831,88007,0.000897,6,6.589361
1,YouTube,74700911,79895,0.002333,13,6.201123
2,Zeldathons,6,0,0.046303,263,4.656318
3,PlayStation,24489619,34212,0.000538,3,2.218804
4,nerdist,481964,4772,0.013819,78,1.440464
...,...,...,...,...,...,...
95,PenOfSmiting,2590,31,0.001615,4,-0.142800
96,RanguGamer,90331,77,0.001256,7,-0.143510
97,Scotland1509,244,4,0.001615,4,-0.144219
98,GenesaurusRex,452,3,0.001615,4,-0.144262


In [38]:
# Save the top-100 table; the visualization section reloads it.
popularity.to_csv("popularity.csv",index=False)

## 8. Visualization

In [39]:
# Reload the top-100 table and the full interaction edge list from disk.
popularity=pd.read_csv('popularity.csv')
interactions=pd.read_csv('users_interactions.csv')

In [40]:
# Usernames of the 100 highest-scoring accounts.
top=popularity['Username']

In [41]:
# Keep only the interactions whose mentioned user is one of the top accounts,
# renumbering the rows from 0.
mentions_top_account = interactions['user_mentioned'].isin(top)
viz = interactions.loc[mentions_top_account].reset_index(drop=True)

In [42]:
# Encode the interaction type numerically: 0 for 'Non-tweet' (a real mention),
# 1 for everything else (a plain tweet).
viz['Type'] = (viz['Type'] != 'Non-tweet').astype(int)
viz

Unnamed: 0,user_posting_tweet,user_mentioned,Type
0,Bearman851,Zeldathons,0
1,jaymisaeki,Zeldathons,0
2,DeMistyB,Zeldathons,0
3,USMC1683,HitCockBottom,0
4,minimarker3,PowerPlayRPG,0
...,...,...,...
2984,YvoDreamcatcher,Anna55714043,0
2985,tjamara_,Zelda_king13,0
2986,tjamara_,drecksuser,0
2987,drecksuser,Zelda_king13,0


In [43]:
# Export the filtered edge list to a csv file.
viz.to_csv("visualization.csv",index=False)