# Twitter user recommendation
## Models and algorithms

In [60]:
import sys
stdin, stdout, stderr = sys.stdin, sys.stdout, sys.stderr
reload(sys)
sys.stdin, sys.stdout, sys.stderr = stdin, stdout, stderr
sys.setdefaultencoding('utf8')
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import simplejson as json
import numpy as np
import pandas as pd
from afinn import Afinn
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, precision_score, recall_score, accuracy_score
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
import json
import operator

## Filter out bots and boring users

In [2]:
## clean and tokenize tweets
def rm_html_tags(str): ## discard tags
    html_prog = re.compile(r'<[^>]+>',re.S)
    return html_prog.sub('', str) ## re.sub(pattern, repl, string)

def rm_html_escape_characters(str):
    pattern_str = r'&quot;|&amp;|&lt;|&gt;|&nbsp;|&#34;|&#38;|&#60;|&#62;|&#160;|&#20284;|&#30524;|&#26684|&#43;|&#20540|&#23612;'
    escape_characters_prog = re.compile(pattern_str, re.S)
    return escape_characters_prog.sub('', str)

def rm_at_user(str):
    return re.sub(r'@[a-zA-Z_0-9]*', '', str)

def rm_url(str):
    return re.sub(r'http[s]?:[/+]?[a-zA-Z0-9_\.\/]*', '', str)

def rm_repeat_chars(str):
    return re.sub(r'(.)(\1){2,}', r'\1\1', str)

def rm_hashtag_symbol(str):
    return re.sub(r'#', '', str)

def replace_emoticon(emoticon_dict, str):
    for k, v in emoticon_dict.items():
        str = str.replace(k, v)
    return str

def rm_time(str):
    return re.sub(r'[0-9][0-9]:[0-9][0-9]', '', str)

def rm_punctuation(current_tweet):
    return re.sub(r'[^\w\s]','',current_tweet)


def pre_process(str0):
    # do not change the preprocessing order only if you know what you're doing 
    str0 = str0.lower()
    str0 = rm_url(str0)        
    str0 = rm_at_user(str0) ## think of what you can extract from user's name?       
    str0 = rm_repeat_chars(str0) ## the num of repeats may count?
    str0 = rm_hashtag_symbol(str0) ## hashtags is important!      
    str0 = rm_time(str0)        
    str0 = rm_punctuation(str0)
        
    str0 = nltk.tokenize.word_tokenize(str0) 
    str0 = [s for s in str0 if s not in stops]

    return str0

## generate list containing all the stopwords
stops = []
path = './stopwords'
for filename in os.listdir(path):
    with open(os.path.join(path, filename).encode('utf-8')) as f:
        line = f.readline()
        stop = [i.strip() for i in line.split(',')]
        stops = stops + stop

stops = stops + list(stopwords.words('english'))

In [3]:
## detect url and at
def detect_url(str):
    prog = re.compile(r'http[s]?:[/+]?[a-zA-Z0-9_\.\/]*')
    result = prog.search(str)
    if result:
        return 1
    else:
        return 0

def detect_at_user(str):
    prog = re.compile(r'@[a-zA-Z_0-9]*')
    result = prog.search(str)
    if result:
        return 1
    else:
        return 0

Get user tweets stats

In [42]:
data_dir = './users with tweets before filtering'  ##Setting your own file path here.
tweets = 'new_tweets_part1.txt'

# stops = []
# path = './stopwords'
# for filename in os.listdir(path):
#     with open(os.path.join(path, filename).encode('utf-8')) as f:
#         line = f.readline()
#         stop = [i.strip() for i in line.split(',')]
#         stops = stops + stop

# stops = stops + list(stopwords.words('english'))
stops = list(stopwords.words('english'))

user_info = []
print('start loading and process data...')
id_url_count_list = []
id_at_count_list = []
id_len_list = []
with open(os.path.join(data_dir, tweets).encode('utf-8'),'r') as f:
    for i, line in enumerate(f):
        tweet_obj = line.strip().split('::::')
        uid = tweet_obj[0]
        tweet = tweet_obj[1]
        created_at = tweet_obj[2]
        quote_count = tweet_obj[3]
        retweet_count = tweet_obj[4]
        favorite_count = tweet_obj[5]
        favorited = tweet_obj[6]
        retweeted = tweet_obj[7]
        entities = tweet_obj[8]

        id_url_count = uid+','+str(detect_url(tweet))
        id_url_count_list.append(id_url_count)
        
        id_at_count = uid+','+str(detect_at_user(tweet))
        id_at_count_list.append(id_at_count)
        
        cleaned_text = pre_process(tweet)

        id_len_tweet = uid+','+str(len(cleaned_text))
        id_len_list.append(id_len_tweet)

fout = open(os.path.join(data_dir, 'tweets_url_count.txt').encode('utf-8'), 'w')
for i in id_url_count_list:
    fout.write('%s\n' %i)
fout.close()

fout = open(os.path.join(data_dir, 'tweets_at_count.txt').encode('utf-8'), 'w')
for i in id_at_count_list:
    fout.write('%s\n' %i)
fout.close()

fout = open(os.path.join(data_dir, 'tweets_len.txt').encode('utf-8'), 'w')
for i in id_len_list:
    fout.write('%s\n' %i)
fout.close()
print('Process completed.')

start loading and process data...


In [45]:
## filter out suspecious users
df = pd.read_csv('./users with tweets before filtering/tweets_url_count.txt',header=None,names=['id','url_ratio'])
df_grouped = df.groupby('id', as_index=False).count()
df_grouped = df_grouped.loc[df_grouped['url_ratio']>80] # users with more than 80 tweets
after_count_list = df_grouped['id'].tolist()

df = pd.read_csv('./users with tweets before filtering/tweets_url_count.txt',header=None,names=['id','url_ratio'])
df_grouped = df.groupby('id', as_index=False).mean()
df_grouped = df_grouped.loc[df_grouped['url_ratio']<0.5] # users with less than 0.5 url ratio
after_url_list = df_grouped['id'].tolist()

df = pd.read_csv('./users with tweets before filtering/tweets_at_count.txt',header=None,names=['id','at_ratio'])
df_grouped = df.groupby('id', as_index=False).mean()
df_grouped = df_grouped.loc[df_grouped['at_ratio']<0.8] # users with less than 0.8 at ratio
after_at_list = df_grouped['id'].tolist()

df = pd.read_csv('./users with tweets before filtering/tweets_len.txt',header=None,names=['id','word_count'])
df_grouped = df.groupby('id', as_index=False).mean()
df_grouped = df_grouped.loc[df_grouped['word_count']>7] # users with less than 7 average number of meaningful words 
after_wordcount_list = df_grouped['id'].tolist()


set0 = set(after_url_list).intersection(after_at_list)
set0 = set0.intersection(after_wordcount_list)
set0 = set0.intersection(after_count_list)
final_user_list = list(set0)

print len(final_user_list)

fout = open(os.path.join(data_dir, 'filtered_user_part1.txt').encode('utf-8'), 'w')
for i in final_user_list:
    i = str(i).replace('L','')
    fout.write('%s\n' %i)
fout.close()

286


## LDA: Tweets-based recommendation

First, we train our model by all the tweets we have (from seed user and their friends and followers)

In [23]:
data_dir = './final data'

In [39]:
# conbine and pre_process tweets
tweets = []
for filename in ['seed_user_tweets.txt','followers_tweets.txt','friends_tweets.txt']:
    with open(os.path.join(data_dir, filename).encode('utf-8')) as f:
        for i, line in enumerate(f):
            postprocess_tweet = []
            try:
                tweet = line.strip().split('::::')[1]
            except:
                continue
            tweet = pre_process(tweet)
            if i%10000==0:
                print 'The %sth 10000 iteration'%(i/10000)
            for word in tweet:
                if word not in stops:
                    postprocess_tweet.append(word)
            tweets.append(' '.join(postprocess_tweet))

The 0th 10000 iteration
The 1th 10000 iteration
The 2th 10000 iteration
The 3th 10000 iteration
The 4th 10000 iteration
The 5th 10000 iteration
The 6th 10000 iteration
The 7th 10000 iteration
The 8th 10000 iteration
The 9th 10000 iteration
The 10th 10000 iteration
The 11th 10000 iteration
The 12th 10000 iteration
The 13th 10000 iteration
The 14th 10000 iteration
The 15th 10000 iteration
The 16th 10000 iteration
The 17th 10000 iteration
The 18th 10000 iteration
The 19th 10000 iteration
The 20th 10000 iteration
The 21th 10000 iteration
The 22th 10000 iteration
The 23th 10000 iteration
The 24th 10000 iteration
The 25th 10000 iteration
The 26th 10000 iteration
The 27th 10000 iteration
The 28th 10000 iteration
The 29th 10000 iteration
The 30th 10000 iteration
The 31th 10000 iteration
The 32th 10000 iteration
The 33th 10000 iteration
The 34th 10000 iteration
The 35th 10000 iteration
The 36th 10000 iteration
The 37th 10000 iteration
The 38th 10000 iteration
The 0th 10000 iteration
The 1th 100

In [40]:
def isEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

def is_not_number(s):
    try:
        float(s)
        return False
    except ValueError:
        return True

In [58]:

# read all tweets of followers and friends
uid_list = []
for filename in ['seed_user_tweets.txt','followers_tweets.txt','friends_tweets.txt']:
    with open(os.path.join(data_dir, filename).encode('utf-8')) as f:
        for i, line in enumerate(f):
            if i%10000==0:
                print 'The %sth 10000 iteration'%(i/10000)
            try:
                tweet = line.strip().split('::::')[1]
            except:
                continue
            uid = line.strip().split('::::')[0]
            uid_list.append(uid)

## save them to a file
fout = open(os.path.join(data_dir, 'tweets_all_processed.txt').encode('utf-8'), 'w')
tweets_new = []
for t,tweet in enumerate(tweets):
    words = tweet.split(' ')
    new = []
    for word in words:
        if isEnglish(word)&is_not_number(word):
            new.append(word)
    new_tweet = ' '.join(new)
    id_tweet = uid_list[t]+'::::'+new_tweet
    fout.write('%s\n' %id_tweet)
fout.close()

The 0th 10000 iteration
The 1th 10000 iteration
The 2th 10000 iteration
The 3th 10000 iteration
The 4th 10000 iteration
The 5th 10000 iteration
The 6th 10000 iteration
The 7th 10000 iteration
The 8th 10000 iteration
The 9th 10000 iteration
The 10th 10000 iteration
The 11th 10000 iteration
The 12th 10000 iteration
The 13th 10000 iteration
The 14th 10000 iteration
The 15th 10000 iteration
The 16th 10000 iteration
The 17th 10000 iteration
The 18th 10000 iteration
The 19th 10000 iteration
The 20th 10000 iteration
The 21th 10000 iteration
The 22th 10000 iteration
The 23th 10000 iteration
The 24th 10000 iteration
The 25th 10000 iteration
The 26th 10000 iteration
The 27th 10000 iteration
The 28th 10000 iteration
The 29th 10000 iteration
The 30th 10000 iteration
The 31th 10000 iteration
The 32th 10000 iteration
The 33th 10000 iteration
The 34th 10000 iteration
The 35th 10000 iteration
The 36th 10000 iteration
The 37th 10000 iteration
The 38th 10000 iteration
The 0th 10000 iteration
The 1th 100

In [64]:
# preparing docs for LDA
x = []
with open(os.path.join(data_dir, 'tweets_all_processed.txt').encode('utf-8')) as f:
    for i, line in enumerate(f):
        tweets = line.strip().split('::::')[1]
        x.append(tweets)
# keep only informative tweets
x = [i for i in x if (len(i.split(' '))>5)]
print len(x)

268928


In [65]:
print("Extracting features...")
x_vect = CountVectorizer(max_df=0.95, min_df=1,max_features = 10000,stop_words='english')
x_feats = x_vect.fit_transform(x)
x_feats_names = x_vect.get_feature_names()
print(x_feats.shape)

Extracting features...
(268928, 10000)


In [66]:
n_components  = 10

# Run LDA
lda = LatentDirichletAllocation(n_components=n_components, 
                                max_iter=5, 
                                learning_method='online', 
                                learning_offset=50.,random_state=0)
# train LDA
lda = lda.fit(x_feats)

# display the words distribution among topics 
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print "Topic %d:" % (topic_idx+1)
        print " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])

no_top_words = 20
display_topics(lda, x_feats_names, no_top_words)

Topic 0:
amazing family working friends friend games home birthday single talking business person weve fact members hair matter bc record honor
Topic 1:
video trump season team story media pay american office michael artist space breaking action john app director trumps justice fbi
Topic 2:
women hes ill president men id fight human children gt car baby hot starting sex law bitch hell united china
Topic 3:
ive community party job public chance police lost film facebook email london data campaign kid student key loved stories eyes
Topic 4:
favorite city song kids power gun photo code movie dm phone youve league sale uk syria shop shooting meeting texas
Topic 5:
game ur fans national shot weekend todays star class james west players information dog stream instagram running science cardi skin
Topic 6:
na gon coming students wan giving college king body group living ta youll nba points thoughts reading security research room
Topic 7:
amp world book history future series war player country 

**Second**, we predict for each user (seed, friends or followers), what's the topic distribution of him or her and save it as a vector(topic0:prob0; topic1:prob1; ...)

In [102]:
# define function for lda prediction, the output is the topic probability distribution of each user
new_vect = CountVectorizer(vocabulary=x_feats_names)
def predict_lda(group_of_tweets):
    new_feats = new_vect.fit_transform(group_of_tweets)
    predict = lda.transform(new_feats)
    pred = predict.mean(axis=0)
    return list(pred)

Calculate topic distribution of **seed** users

In [109]:
data_dir = './final data'  ##Setting your own file path here.
tweets = 'tweets_all_processed.txt'

print('start loading and process data...')

tweets_group = []
topic_distribution_list = []
uid0 = '776681696624345088'
left_out = 0
with open(os.path.join(data_dir, tweets).encode('utf-8'),'r') as f:
    for i, line in enumerate(f):
        tweet_obj = line.strip().split('::::')
        uid = tweet_obj[0]
        tweet = tweet_obj[1]
        
#         # remove empty tweets
#         if tweet =='':
#             left_out+=1
#             continue
#         else:
#             pass
        
        if i%100000==0:
            print 'The %sth 100000 iteration'%(i/100000)
            
        if uid == uid0:
            tweets_group.append(tweet)
        else:
            
            # keep users with more than 50 valid tweets
            if len(tweets_group)>2:
                pass
            else:
                left_out+=1
                uid0=uid
                tweets_group = []
                continue
                
            topic_distribution = predict_lda(tweets_group)
            id_topic_distribution = uid0+'::::'+json.dumps(topic_distribution)
            topic_distribution_list.append(id_topic_distribution)
            
            uid0=uid
            tweets_group = []
        


fout = open(os.path.join(data_dir, 'seed_user_distribution.txt').encode('utf-8'), 'w')# from 5304: followers; from 8652: friends
for i in topic_distribution_list[:5303]:
    fout.write('%s\n' %i)
fout.close()

fout = open(os.path.join(data_dir, 'follower_distribution.txt').encode('utf-8'), 'w')# from 5304: followers; from 8652: friends
for i in topic_distribution_list[5304:8651]:
    fout.write('%s\n' %i)
fout.close()

fout = open(os.path.join(data_dir, 'friend_distribution.txt').encode('utf-8'), 'w')# from 5304: followers; from 8652: friends
for i in topic_distribution_list[8652:]:
    fout.write('%s\n' %i)
fout.close()

print('Process completed.')

start loading and process data...
The 0th 100000 iteration
The 1th 100000 iteration
The 2th 100000 iteration
The 3th 100000 iteration
The 4th 100000 iteration
The 5th 100000 iteration
The 6th 100000 iteration
The 7th 100000 iteration
The 8th 100000 iteration
The 9th 100000 iteration
The 10th 100000 iteration
The 11th 100000 iteration
Process completed.


In [110]:
len(topic_distribution_list)

13699

In [111]:
left_out

392

**Third**, calculate similarity between each user and their friends and followers, output the top 10 most similar users

In [4]:
def calculate_similarity(vect1,vect2):
    result = 1 - spatial.distance.cosine(vect1,vect2)
    return result

Take 7 users as example: 836758265824423936, 2677264639,  809056262432391168, 3161727481,  3260137987, 53185280, 833064800

In [95]:
# look up follower and friend id from tables
input_uid = '2677264639'
import pandas as pd
followers = pd.read_pickle('./final data/followers_filtered')
friends = pd.read_pickle('./final data/friends_filtered')
follower_ids = followers.loc[followers['seed_user']==int(input_uid)]['idstr']
follower_ids = follower_ids.tolist()
follower_ids = [str(i).replace('L','') for i in follower_ids]

friend_ids = friends.loc[friends['seed_user']==int(input_uid)]['idstr']
friend_ids = friend_ids.tolist()
friend_ids = [str(i).replace('L','') for i in friend_ids]

Calculate the accuracy of similarity based classification (recommend when similarity higher than 0.98)

In [96]:
data_dir = './final data'
with open(os.path.join(data_dir, 'seed_user_distribution.txt').encode('utf-8')) as f:
    for i, line in enumerate(f):
        obj = line.strip().split('::::')
        uid = obj[0]
        seed_vect = json.loads(obj[1])
        if str(uid) == str(input_uid):
            print i
            break
        else:
            pass

# these are users that follow the seed user but are not followed back, which we believe are the type we shouldn't recommend
follower_similarity_dict = {}
with open(os.path.join(data_dir, 'follower_distribution.txt').encode('utf-8')) as f:
    for i, line in enumerate(f):
        obj = line.strip().split('::::')
        uid = obj[0]
        if uid in follower_ids:
            pass
        else:
            continue
        follower_vect = json.loads(obj[1])
        sim = calculate_similarity(seed_vect,follower_vect)
        follower_similarity_dict[uid] = sim

# these are users that are followed the seed user but not follow back, which we believe are the type we should recommend     
friend_similarity_dict = {}
with open(os.path.join(data_dir, 'friend_distribution.txt').encode('utf-8')) as f:
    for i, line in enumerate(f):
        obj = line.strip().split('::::')
        uid = obj[0]
        if uid in friend_ids:
            pass
        else:
            continue
        friend_vect = json.loads(obj[1])
        sim = calculate_similarity(seed_vect,friend_vect)
        friend_similarity_dict[uid] = sim

sorted_followers = sorted(follower_similarity_dict.items(), key=operator.itemgetter(1), reverse=True)
print sorted_followers

sorted_friends = sorted(friend_similarity_dict.items(), key=operator.itemgetter(1), reverse=True)
print sorted_friends

try:
    recommend = sorted_friends[:9]
except:
    recommend = sorted_friends

recommend_id_list = []
for r in recommend:
    recommend_id_list.append(r[0])
recommend_id_list # recommend users from friend list and follower list

3370
[('72665173', 0.9855692580977835), ('2938100399', 0.9784753772737028), ('625283075', 0.977270998435106), ('3631362339', 0.9753947524029087), ('573181047', 0.9715385166391677), ('3290208761', 0.9658643345438105), ('174930501', 0.9657654560480828), ('2369696214', 0.9626900182095447), ('19091405', 0.9610259221838785), ('4901673680', 0.9552355764649323), ('814418096', 0.9422272307231158), ('3378043343', 0.9376650785263239), ('499202239', 0.9303848876131111), ('33612317', 0.9222368697976674), ('1014178490', 0.9130825632674969), ('32712325', 0.8323133274344572)]
[('1400748608', 0.9831191151903111), ('783108378742562816', 0.9809193069965703), ('1056487393', 0.9790007662277302), ('181572333', 0.9683051825448191), ('3950477674', 0.9661999741485021), ('95023423', 0.9609497105212569), ('112540334', 0.9576612658557357), ('78525538', 0.9553211649683933), ('866953267', 0.9344377926063014), ('152457403', 0.9018429921867455)]


['1400748608',
 '783108378742562816',
 '1056487393',
 '181572333',
 '3950477674',
 '95023423',
 '112540334',
 '78525538',
 '866953267']

In [97]:
all_similarity_dict = {}
with open(os.path.join(data_dir, 'all_topic_distribution.txt').encode('utf-8')) as f:
    for i, line in enumerate(f):
        obj = line.strip().split('::::')
        uid = obj[0]
        all_vect = json.loads(obj[1])
        sim = calculate_similarity(seed_vect,all_vect)
        all_similarity_dict[uid] = sim

sorted_all = sorted(all_similarity_dict.items(), key=operator.itemgetter(1), reverse=True)
print sorted_all[:9]

try:
    recommend = sorted_all[:9]
except:
    recommend = sorted_all

recommend_id_list = []
for r in recommend:
    recommend_id_list.append(r[0])
recommend_id_list # recommend users from full candidate list

[('2677264639', 1.0), ('939380029', 0.9966474420774795), ('513917014', 0.9955926016126856), ('1932382154', 0.9955440854063033), ('1692679680', 0.9948573151752046), ('527754604', 0.9947138747558425), ('946564248', 0.994358004234309), ('614053008', 0.9943477482259695), ('621358322', 0.994105343837573)]


['2677264639',
 '939380029',
 '513917014',
 '1932382154',
 '1692679680',
 '527754604',
 '946564248',
 '614053008',
 '621358322']

## Profile-based recommendation

**Predict whether a seed user will follow another user**

seed_user = [392488192,45133149,376987007,22513648,3718133833,30140207,361067987,35832236,896663059]

In [206]:
names = ['seed_user_id','id','name','verified','statuses_cnt','followers_cnt','friends_cnt','listed_cnt','favourites_cnt','description']
followers = pd.read_csv('./final data/big train/bigtrain_followers.txt',sep='::::',header=None,names=names,engine='python')

names = ['seed_user_id','id','name','verified','statuses_cnt','followers_cnt','friends_cnt','listed_cnt','favourites_cnt','description']
friends = pd.read_csv('./final data/big train/bigtrain_friends.txt',sep='::::',header=None,names=names,engine='python')

names = ['id','name','verified','statuses_cnt','followers_cnt','friends_cnt','listed_cnt','favourites_cnt','description']
seed_users = pd.read_csv('./final data/seed_user_profile.txt',sep='::::',header=None,names=names,engine='python')

followers['label'] = 0
friends['label'] = 1 
print followers.shape,friends.shape,seed_users.shape

(522, 11) (1142, 11) (575, 9)


In [235]:
input_uid = 896663059
followers_input = followers.loc[followers['seed_user_id']==input_uid]
friends_input = friends.loc[friends['seed_user_id']==input_uid]
print followers_input.shape, friends_input.shape

(44, 11) (9, 11)


In [236]:
# followers_input = pd.merge(followers_input,followers_group)
# friends_input = pd.merge(friends_input,friends_group)
# print followers_input.shape, friends_input.shape

Process df: 1. concatenate 2 df; 2. feature engineering; 3. transform into matrix for sklearn

In [237]:
## labeling
input_df = pd.concat([followers_input, friends_input], axis=0)
print input_df.shape,input_df.columns

input_df_new = input_df.drop(['seed_user_id','id','name','description'], axis=1)
print input_df_new.shape,input_df_new.columns

input_array = np.array(input_df_new.iloc[:,:6])
y = np.array(input_df_new['label'])

(53, 11) Index([u'seed_user_id', u'id', u'name', u'verified', u'statuses_cnt',
       u'followers_cnt', u'friends_cnt', u'listed_cnt', u'favourites_cnt',
       u'description', u'label'],
      dtype='object')
(53, 7) Index([u'verified', u'statuses_cnt', u'followers_cnt', u'friends_cnt',
       u'listed_cnt', u'favourites_cnt', u'label'],
      dtype='object')


In [238]:
print("Start training and predict...")
kf = KFold(n_splits=5,shuffle=True) ## function to spilt train and test data set
avg_p = 0
avg_r = 0
avg_a = 0
for train, test in kf.split(input_array):
    model = LogisticRegression().fit(input_array[train], y[train])
    predicts = model.predict(input_array[test])
#     print(classification_report(y[test],predicts))
    avg_p += precision_score(y[test],predicts, average='macro',pos_label=1)
    avg_r += recall_score(y[test],predicts, average='macro',pos_label=1)
    avg_a += accuracy_score(y[test],predicts)

print('Average Precision of svm is %f.' %(avg_p/5))
print('Average Recall of svm is %f.' %(avg_r/5))
print('Average Accuracy of svm is %f.' %(avg_a/5))

Start training and predict...
Average Precision of svm is 0.710707.
Average Recall of svm is 0.752778.
Average Accuracy of svm is 0.849091.


Predict on all users by this model

In [200]:
input_test = pd.concat([followers.iloc[:,1:9], friends.iloc[:,1:9], seed_users], axis=0)
input_test_new = input_test.drop(['id','name','description'], axis=1)
print input_test_new.shape

x_feats_all = np.array(input_test_new)

(2239, 6)


In [201]:
# final model
model_final = LogisticRegression().fit(input_array, y)
predicts = model_final.predict(x_feats_all)
prob = model_final.predict_proba(x_feats_all)
print model_final.classes_, prob

prob_recommend = prob[:,1].tolist()
prob_recommend.sort(reverse=True)
prob_recommend_top = prob_recommend[:10]
prob_recommend = prob[:,1].tolist()

prob_series = pd.Series(prob_recommend)
input_test['prob'] = prob_series
input_test

[0 1] [[0.00000000e+00 1.00000000e+00]
 [0.00000000e+00 1.00000000e+00]
 [4.66200363e-04 9.99533800e-01]
 ...
 [0.00000000e+00 1.00000000e+00]
 [0.00000000e+00 1.00000000e+00]
 [0.00000000e+00 1.00000000e+00]]


Unnamed: 0,description,favourites_cnt,followers_cnt,friends_cnt,id,listed_cnt,name,statuses_cnt,verified,prob
0,,1602,625,373,856264641722195970,3,punflower,797,False,1.000000
1,,5254,70576,11877,722210815110041600,367,HandmaidsOnHulu,2992,True,1.000000
2,,149,6180,3786,922634896207687685,2,vaakoh,154,False,0.999534
3,,392,465,374,3837479480,0,lift_momentum,243,False,0.999988
4,,12654,2362,1707,361695170,135,JenniNexus,7788,False,1.000000
5,,32170,3536,2528,725350782497906688,181,RoanokeMaven,11622,False,1.000000
6,,33396,102131,45311,1584118603,253,FromAshestoNew,5981,True,1.000000
7,,12851,1374,754,169604573,61,Lee_G_Malone,9931,False,1.000000
8,,51,99,53,703927794150055936,6,UncappingBadger,666,False,1.000000
9,,17408,13840,1689,90995027,86,samzorz,16467,False,1.000000


In [202]:
recommmend_list = input_test[input_test['prob'].isin(prob_recommend_top)][:10]
recommmend_list

Unnamed: 0,description,favourites_cnt,followers_cnt,friends_cnt,id,listed_cnt,name,statuses_cnt,verified,prob
0,,1602,625,373,856264641722195970,3,punflower,797,False,1.0
1,,5254,70576,11877,722210815110041600,367,HandmaidsOnHulu,2992,True,1.0
4,,12654,2362,1707,361695170,135,JenniNexus,7788,False,1.0
5,,32170,3536,2528,725350782497906688,181,RoanokeMaven,11622,False,1.0
6,,33396,102131,45311,1584118603,253,FromAshestoNew,5981,True,1.0
7,,12851,1374,754,169604573,61,Lee_G_Malone,9931,False,1.0
9,,17408,13840,1689,90995027,86,samzorz,16467,False,1.0
10,,1277,6205,2200,28719244,106,bunnyXablaze,17950,False,1.0
11,,6796,4134,931,2265419863,18,Tentanman_,4762,False,1.0
12,,3144,18371,15220,173631389,427,lowcarbyum,27213,False,1.0
