In [0]:
import numpy as np
import pandas as pd
pd.set_option('use_inf_as_na', True)
import csv
from datetime import datetime
import re

In [0]:
# load in tweets data for training
path_to_file_2 = "Final datasets/users_tweets/"
df = pd.read_csv(path_to_file_2+"API_tweets_homelessness_for_features.csv")

In [0]:
# swear words prevalence
file = open("swearwords.txt","r")
swearwords = []
for line in file:
    swearwords.append(line)
swearwords = list(map(lambda each:each.strip("\n"), swearwords))

In [0]:
# polite/customer service phrases
file = open("customerservice.txt","r")
polite = []
for line in file:
    polite.append(line)
polite = list(map(lambda each:each.strip("\n"), polite))

In [0]:
# r/4chan negative sentiment phrases
file = open("4chan.txt","r")
chan = []
for line in file:
    if len(line) > 2:
        chan.append(line)
chan = list(map(lambda each:each.strip("\n"), chan))

In [0]:
# tokenise tweets

import preprocessor as p
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords

df.text = df.text.astype(str)

punctuation = string.punctuation
other = '“¡…'
punctuation2 = punctuation + other
stop_words = stopwords.words('english')
    
tokens = []
cleaned = []

df['text'] = df['text'].str.casefold()

for i in df['text']:
    # clean drops items mentions, hasthags, hyperlinks, emojis etc
    clean = p.clean(i)
    # token seperates tweets into lists of words
    token = word_tokenize(clean)
    # words removes punctuated items, empty items and stopwords
    words = [''.join(c for c in s if c not in punctuation2) for s in token]
    words = [s for s in words if len(s) > 1]
    words = [w for w in words if not w in stop_words]
    sentence = ' '.join(word for word in words)
    tokens.append(words)
    cleaned.append(sentence)

df['tokens'] = tokens
df['text_clean'] = cleaned

In [0]:
# occurences per tweet of phrases
matches = []
for i in df.tokens:
    match = len(list(set(i) & set(swearwords)))
    matches.append(match)
df['swears'] = matches

matches2 = []
for i in df.tokens:
    match = len(list(set(i) & set(polite)))
    matches2.append(match)
df['polite'] = matches2

matches3 = []
for i in df.tokens:
    match = len(list(set(i) & set(chan)))
    matches3.append(match)
df['fourchan'] = matches3

df.tail()

Unnamed: 0.1,Unnamed: 0,date,time,screen_name,text,tweetid,datetime,retweet_count,favourite_count,tokens,text_clean,swears
2166231,2166265,2020-01-05,17:04:46,zyshanaryf,rt @aoc: this is a war crime.\n\nthreatening t...,1.213869e+18,2020-01-05 17:04:46,96615.0,0.0,"[rt, war, crime, threatening, target, kill, in...",rt war crime threatening target kill innocent ...,0
2166232,2166266,2020-01-05,17:04:15,zyshanaryf,rt @kaepernick7: there is nothing new about am...,1.213869e+18,2020-01-05 17:04:15,22924.0,0.0,"[rt, nothing, new, american, terrorist, attack...",rt nothing new american terrorist attacks blac...,0
2166233,2166267,2020-01-05,13:16:21,zyshanaryf,texans. titans. saints. seahawks.\n\n2 of 4 so...,1.213811e+18,2020-01-05 13:16:21,0.0,0.0,"[texans, titans, saints, seahawks, far]",texans titans saints seahawks far,0
2166234,2166268,2020-01-05,01:13:31,zyshanaryf,@nfluk @deshaunwatson this dude is a born winn...,1.21363e+18,2020-01-05 01:13:31,0.0,0.0,"[dude, born, winner, utterly, absurd, continua...",dude born winner utterly absurd continually wi...,0
2166235,2166269,2020-01-05,01:12:24,zyshanaryf,rt @nfluk: speechless.\n\n@deshaunwatson... wo...,1.213629e+18,2020-01-05 01:12:24,219.0,0.0,"[rt, speechless, wow]",rt speechless wow,0


In [0]:
# mean prevalence by users
users = df.groupby('screen_name').mean()
users = users.iloc[:,1:].copy()
users

Unnamed: 0_level_0,tweetid,retweet_count,favourite_count,swears,polite,fourchan
screen_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
008Mi7,1.244086e+18,214.785000,0.070000,0.005000,0.080000,0.390000
01iver5haw,1.238395e+18,3750.834171,0.758794,0.005025,0.065327,0.311558
02pash,1.249757e+18,10.490000,0.570000,0.015000,0.115000,0.205000
05_noodle,1.247299e+18,1164.792746,0.518135,0.077720,0.025907,0.165803
0644labella,1.250203e+18,0.010000,0.200000,0.015000,0.185000,0.455000
...,...,...,...,...,...,...
zuma_khulubuse,1.253582e+18,100.610000,2.210000,0.040000,0.055000,0.440000
zumayabooks,1.254236e+18,37.025000,1.245000,0.015000,0.080000,0.550000
zwideservesbass,1.254651e+18,210.685000,0.935000,0.085000,0.050000,0.220000
zwitterion_work,1.253758e+18,2447.980000,0.140000,0.050000,0.075000,0.275000


In [0]:
# include log versions
log_swears,log_polite,log_fourchan  = users.swears.apply(np.log).fillna(0),users.polite.apply(np.log).fillna(0),users.fourchan.apply(np.log).fillna(0)
users['log_swears'],users['log_polite'],users['log_fourchan'] = log_swears,log_polite,log_fourchan
users

Unnamed: 0_level_0,tweetid,retweet_count,favourite_count,swears,polite,fourchan,log_swears,log_polite,log_fourchan
screen_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
008Mi7,1.244086e+18,214.785000,0.070000,0.005000,0.080000,0.390000,-5.298317,-2.525729,-0.941609
01iver5haw,1.238395e+18,3750.834171,0.758794,0.005025,0.065327,0.311558,-5.293305,-2.728355,-1.166170
02pash,1.249757e+18,10.490000,0.570000,0.015000,0.115000,0.205000,-4.199705,-2.162823,-1.584745
05_noodle,1.247299e+18,1164.792746,0.518135,0.077720,0.025907,0.165803,-2.554640,-3.653252,-1.796954
0644labella,1.250203e+18,0.010000,0.200000,0.015000,0.185000,0.455000,-4.199705,-1.687399,-0.787458
...,...,...,...,...,...,...,...,...,...
zuma_khulubuse,1.253582e+18,100.610000,2.210000,0.040000,0.055000,0.440000,-3.218876,-2.900422,-0.820981
zumayabooks,1.254236e+18,37.025000,1.245000,0.015000,0.080000,0.550000,-4.199705,-2.525729,-0.597837
zwideservesbass,1.254651e+18,210.685000,0.935000,0.085000,0.050000,0.220000,-2.465104,-2.995732,-1.514128
zwitterion_work,1.253758e+18,2447.980000,0.140000,0.050000,0.075000,0.275000,-2.995732,-2.590267,-1.290984


In [0]:
# tweets during office hours
screen_names = users.index.to_list()

In [0]:
gp = df.groupby('screen_name')['tweetid'].count()
gp.sort_values(0).tail(11)

screen_name
FATbaddieonabu1    200
TVTalkWithJWalk    200
TTaesarang         200
FBACOMMITTEE       200
FBrownWilliams     200
TSpeth5            200
FCN2go             200
TSARidgeMeadows    200
TRVST_Poverty      200
ExpertLink2019     200
008Mi7             200
Name: tweetid, dtype: int64

In [0]:
# tweet frequency by office hours for popular timezones - UTC based
office_hours = []

for i in screen_names:
    
    # slice by username
    test = pd.DataFrame(df.loc[df.screen_name == i])
    test['time'] = pd.to_datetime(test.time, format='%H:%M:%S')
    test['hour'] = test['time'].dt.hour
    
    # pivot by date and hour of tweet
    table = pd.pivot_table(test, values=['tweetid'], index=['date'],
                        columns=['hour'], aggfunc='count',fill_value=0)
   
    # formatting
    table2 = table.reset_index()
    table2.columns = table2.columns.droplevel()
    table2.rename(columns={table2.columns[0]: "date" }, inplace = True)
    
    # UTC hours - but not much we can do with timezone...
    nine_to_five_utc = [9,10,11,12,13,14,15,16,17] # gmt UK
    nine_to_five_est = [5,6,7,8,9,10,11,12,13] # east coast US
    nine_to_five_pct = [2,3,4,5,6,7,8,9,10] # west coast US
    nine_to_five_mst = [12,13,14,15,16,17,18,19,20]# Moscow standard
    nine_to_five_aus = [19,20,21,22,23,0,1,2,3] # australia east coast    
    
    # sum the amount of tweets between hours 9-5
    total = np.array(np.sum(table2.iloc[:,1:],axis=1))
    office_utc = np.array(np.sum(table2.reindex(columns = nine_to_five_utc, fill_value=0),axis=1))
    office_est = np.array(np.sum(table2.reindex(columns = nine_to_five_est, fill_value=0),axis=1))
    office_pct = np.array(np.sum(table2.reindex(columns = nine_to_five_pct, fill_value=0),axis=1))
    office_mst = np.array(np.sum(table2.reindex(columns = nine_to_five_mst, fill_value=0),axis=1))
    office_aus = np.array(np.sum(table2.reindex(columns = nine_to_five_aus, fill_value=0),axis=1))

    # return the percentage for each day and take average - can do either way..
    timezones =  [office_utc,office_est, office_pct,office_mst,office_aus]
    means = []
    
    for i in timezones:       
        mean_hours = np.round((np.sum(i)/np.sum(total)),decimals=3)
        means.append(mean_hours)
        
    office_hours.append(means)

office_hours_df = pd.DataFrame(office_hours,columns=['utc','est','pct','mst','aus'])
office_hours_df['username'] = screen_names
office_hours_df = office_hours_df[['username','utc','est','pct','mst','aus']]


In [0]:
office_hours_df.columns = ['username','office_hours_utc','office_hours_est',
                           'office_hours_pct','office_hours_mst','office_hours_aus']

office_hours_df

In [0]:
accounts_train = accounts_train.merge(users,how='left',left_on='username',right_on=users.index)
accounts_train = accounts_train.drop(columns=['tweetid','retweet_count','favourite_count'])

In [0]:
accounts_train = accounts_train.merge(office_hours_df,how='left',left_on='username',right_on='username')
accounts_train.head()

Unnamed: 0,id,name,username,location,url,description,verified,followers,friends,favourites_count,...,polite,fourchan,log_swears,log_polite,log_fourchan,office_hours_utc,office_hours_est,office_hours_pct,office_hours_mst,office_hours_aus
0,1238173275743805440,0008MI:8 REDBULL CODENAME TARSOC | ANCIENT 1 O...,008Mi7,"Kington Langley, England",https://t.co/XkOZohK2d3,I'm a blown cover agent protecting World secur...,False,22,722,175,...,0.08,0.39,-5.298317,-2.525729,-0.941609,0.535,0.165,0.195,0.645,0.33
1,1113034041408806912,oliver s,01iver5haw,United Kingdom,https://t.co/UaeTsGvWJQ,history @ oxford • midlander,False,73,327,2940,...,0.065327,0.311558,-5.293305,-2.728355,-1.16617,0.462,0.251,0.111,0.558,0.427
2,753578136,Niall Love,02pash,,,To fight and support for the rights of the ho...,False,362,288,7928,...,0.115,0.205,-4.199705,-2.162823,-1.584745,0.615,0.605,0.385,0.55,0.095
3,1041061771723907072,Fish,05_noodle,he/him!!!!,,Kyle is everything Harry Potter wishes he was ...,False,55,193,27565,...,0.025907,0.165803,-2.55464,-3.653252,-1.796954,0.057,0.161,0.363,0.228,0.637
4,249481775,Da,0644labella,,,,False,13,78,1191,...,0.185,0.455,-4.199705,-1.687399,-0.787458,0.245,0.02,0.07,0.515,0.685


In [0]:
# save as csv

In [0]:
accounts_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11353 entries, 0 to 11352
Data columns (total 42 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              11353 non-null  int64  
 1   name                            11352 non-null  object 
 2   username                        11353 non-null  object 
 3   location                        8578 non-null   object 
 4   url                             4814 non-null   object 
 5   description                     10198 non-null  object 
 6   verified                        11353 non-null  bool   
 7   followers                       11353 non-null  int64  
 8   friends                         11353 non-null  int64  
 9   favourites_count                11353 non-null  int64  
 10  statuses_count                  11353 non-null  int64  
 11  default_profile                 11353 non-null  int64  
 12  default_profile_image           

In [0]:
# load in datasets and merge with new data

In [0]:
stefan_training = pd.read_csv("Final datasets/content_data/homelessness.csv")
training = accounts_train
stefan_training.shape

(11274, 37)

In [0]:
training = training.merge(stefan_training,how="left",left_on="username",right_on="screen_name")

In [0]:
training.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11353 entries, 0 to 11352
Data columns (total 79 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              11353 non-null  int64  
 1   name                            11352 non-null  object 
 2   username                        11353 non-null  object 
 3   location                        8578 non-null   object 
 4   url                             4814 non-null   object 
 5   description                     10198 non-null  object 
 6   verified                        11353 non-null  bool   
 7   followers                       11353 non-null  int64  
 8   friends                         11353 non-null  int64  
 9   favourites_count                11353 non-null  int64  
 10  statuses_count                  11353 non-null  int64  
 11  default_profile                 11353 non-null  int64  
 12  default_profile_image           

In [0]:
training = training.drop(columns='screen_name')


In [0]:
aris_climate = pd.read_csv("Final datasets/business_specific/business_users_homelessness.csv")
aris_climate

Unnamed: 0.1,Unnamed: 0,id,username,url_titles,username_urltitle_simimlarity,username_in_urltitle,username_name_similarity,username_in_bio,lower_userid,followers_friends_ratio,bio_sentiment
0,0,1238173275743805440,008Mi7,National Security Agency,0.066667,False,0.178571,False,False,0.030471,negative
1,1,1113034041408806912,01iver5haw,oliver shaw. – blog. couldn't think of a bette...,0.225806,False,0.444444,False,False,0.223242,neutral
2,2,753578136,02pash,,,,0.125000,False,True,1.256944,negative
3,3,1041061771723907072,05_noodle,,,,0.000000,False,False,0.284974,neutral
4,4,249481775,0644labella,,,,0.153846,False,True,0.166667,neutral
...,...,...,...,...,...,...,...,...,...,...,...
11348,11348,1192842602086895617,zuma_khulubuse,,,,0.692308,False,False,20.327485,neutral
11349,11349,14948080,zumayabooks,Liz Burton's Portable Soup | “We live not for ...,0.119205,False,0.109091,False,True,0.866990,negative
11350,11350,216353292,zwideservesbass,Better Late EP by Zwide Ndwandwe | Free Listen...,0.205128,False,0.903226,False,True,1.451482,positive
11351,11351,44130732,zwitterion_work,,0.222222,False,0.277778,False,True,0.686211,neutral


In [0]:
aris_climate = aris_climate.iloc[:,1:]
aris_climate

Unnamed: 0,id,username,url_titles,username_urltitle_simimlarity,username_in_urltitle,username_name_similarity,username_in_bio,lower_userid,followers_friends_ratio,bio_sentiment
0,1238173275743805440,008Mi7,National Security Agency,0.066667,False,0.178571,False,False,0.030471,negative
1,1113034041408806912,01iver5haw,oliver shaw. – blog. couldn't think of a bette...,0.225806,False,0.444444,False,False,0.223242,neutral
2,753578136,02pash,,,,0.125000,False,True,1.256944,negative
3,1041061771723907072,05_noodle,,,,0.000000,False,False,0.284974,neutral
4,249481775,0644labella,,,,0.153846,False,True,0.166667,neutral
...,...,...,...,...,...,...,...,...,...,...
11348,1192842602086895617,zuma_khulubuse,,,,0.692308,False,False,20.327485,neutral
11349,14948080,zumayabooks,Liz Burton's Portable Soup | “We live not for ...,0.119205,False,0.109091,False,True,0.866990,negative
11350,216353292,zwideservesbass,Better Late EP by Zwide Ndwandwe | Free Listen...,0.205128,False,0.903226,False,True,1.451482,positive
11351,44130732,zwitterion_work,,0.222222,False,0.277778,False,True,0.686211,neutral


In [0]:
training = training.merge(aris_climate,how='left',left_on='id',right_on='id')

In [0]:
training.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11353 entries, 0 to 11352
Data columns (total 87 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              11353 non-null  int64  
 1   name                            11352 non-null  object 
 2   username_x                      11353 non-null  object 
 3   location                        8578 non-null   object 
 4   url                             4814 non-null   object 
 5   description                     10198 non-null  object 
 6   verified                        11353 non-null  bool   
 7   followers                       11353 non-null  int64  
 8   friends                         11353 non-null  int64  
 9   favourites_count                11353 non-null  int64  
 10  statuses_count                  11353 non-null  int64  
 11  default_profile                 11353 non-null  int64  
 12  default_profile_image           

In [0]:
training = training.drop(columns='followers_ratio')

In [0]:
training.username_in_urltitle = training.username_in_urltitle*1
training.username_in_bio = training.username_in_bio*1
training.lower_userid = training.lower_userid*1
training.head()

Unnamed: 0,id,name,username_x,location,url,description,verified,followers,friends,favourites_count,...,hash_PT,username_y,url_titles,username_urltitle_simimlarity,username_in_urltitle,username_name_similarity,username_in_bio,lower_userid,followers_friends_ratio,bio_sentiment
0,1238173275743805440,0008MI:8 REDBULL CODENAME TARSOC | ANCIENT 1 O...,008Mi7,"Kington Langley, England",https://t.co/XkOZohK2d3,I'm a blown cover agent protecting World secur...,False,22,722,175,...,0.25,008Mi7,National Security Agency,0.066667,0.0,0.178571,0,0,0.030471,negative
1,1113034041408806912,oliver s,01iver5haw,United Kingdom,https://t.co/UaeTsGvWJQ,history @ oxford • midlander,False,73,327,2940,...,0.125628,01iver5haw,oliver shaw. – blog. couldn't think of a bette...,0.225806,0.0,0.444444,0,0,0.223242,neutral
2,753578136,Niall Love,02pash,,,To fight and support for the rights of the ho...,False,362,288,7928,...,0.335,02pash,,,,0.125,0,1,1.256944,negative
3,1041061771723907072,Fish,05_noodle,he/him!!!!,,Kyle is everything Harry Potter wishes he was ...,False,55,193,27565,...,0.145078,05_noodle,,,,0.0,0,0,0.284974,neutral
4,249481775,Da,0644labella,,,,False,13,78,1191,...,0.0,0644labella,,,,0.153846,0,1,0.166667,neutral


In [0]:
training = pd.get_dummies(training, columns=['bio_sentiment'])

In [0]:
training.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11353 entries, 0 to 11352
Data columns (total 88 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              11353 non-null  int64  
 1   name                            11352 non-null  object 
 2   username_x                      11353 non-null  object 
 3   location                        8578 non-null   object 
 4   url                             4814 non-null   object 
 5   description                     10198 non-null  object 
 6   verified                        11353 non-null  bool   
 7   followers                       11353 non-null  int64  
 8   friends                         11353 non-null  int64  
 9   favourites_count                11353 non-null  int64  
 10  statuses_count                  11353 non-null  int64  
 11  default_profile                 11353 non-null  int64  
 12  default_profile_image           

In [0]:
training = training.drop(columns='url_titles')

In [0]:
training2.to_csv("homelessness_users_features.csv")

In [0]:
training2 = training.dropna(subset=['id'])
training2.shape

(11353, 87)