In [1]:
import re
import os
import string
import pandas as pd
import datetime as dt

os.chdir('c:\\Users\\AndreaHrelja\\Documents\\Faks\\twitter_scraper\\src')

import utils.fileio as fileio
from twitter_scraper import settings


USER_OBJS_CSV = os.path.join(settings.USER_OBJS_DIR, 'user-objs.csv')
LOCATIONS_JSON = os.path.join(settings.INPUT_DIR, 'locations.json')

locations = pd.read_json(LOCATIONS_JSON)[0]
accepted_chars = string.ascii_lowercase + 'čšćžđ'


#user_df = pd.read_csv('C:\\Users\\AndreaHrelja\\Documents\\Faks\\twitter_scraper\\output\\users\\objs\\2022-02-03\\user-objs.csv')
user_df = pd.read_csv('C:\\Users\\AndreaHrelja\\Documents\\Faks\\twitter_scraper\\debug\\output\\users\\objs\\2022-02-13\\user-objs.csv')
user_df = user_df[user_df['protected'] == False]
user_df['created_at'] = pd.to_datetime(user_df['created_at'], format='%a %b %d %H:%M:%S %z %Y') # 30s

user_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8338 entries, 0 to 8719
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   user_id          8338 non-null   int64              
 1   name             8334 non-null   object             
 2   screen_name      8338 non-null   object             
 3   location         5503 non-null   object             
 4   protected        8338 non-null   bool               
 5   verified         8338 non-null   bool               
 6   followers_count  8338 non-null   int64              
 7   friends_count    8338 non-null   int64              
 8   statuses_count   8338 non-null   int64              
 9   created_at       8338 non-null   datetime64[ns, UTC]
dtypes: bool(2), datetime64[ns, UTC](1), int64(4), object(3)
memory usage: 602.6+ KB


In [8]:
def get_user_df():
    #user_df = pd.read_csv('C:\\Users\\AndreaHrelja\\Documents\\Faks\\twitter_scraper\\output\\users\\objs\\2022-02-03\\user-objs.csv')
    user_df = pd.read_csv(USER_OBJS_CSV)
    user_df = user_df[user_df['protected'] == False]
    user_df['created_at'] = pd.to_datetime(user_df['created_at'], format='%a %b %d %H:%M:%S %z %Y') # 30s
    return user_df


def is_croatian(location):
    global locations
    
    if location == '':
        return False
    
    cro_locations = ('croa', 'hrvat')
    
    if location.lower() in locations:
        return True
    else:
        return any(cro_loc in location.lower() for cro_loc in cro_locations)

def transform_user_df(user_df):
    user_df['is_croatian'] = user_df['location'].fillna('').transform(is_croatian)
    user_df['clean_location'] = user_df[user_df['is_croatian'] == True]['location'].transform(clean_location)
    user_df = user_df[
        (user_df['is_croatian'] == True)
        & (user_df['statuses_count'] > 10)
        & (user_df['friends_count'] > 10)
        & (user_df['friends_count'] < 5000)
        & (user_df['followers_count'] > 10)
        #& (user_df['followers_count'] < 5000)
    ].sort_values(by='followers_count')
    return user_df.reset_index(drop=True)


def clean_location(location):
    if location == '':
        return location
    
    new_location = location.lower()
    location_names = ('republic of croatia', 'republika hrvatska', 'hrvatska', 'croatia', 'croacia', 'croatie')
    
    if re.search(r'[ ]+', location):
        new_location = new_location.replace(re.search(r'[ ]+', location).group(), ' ').strip()
    
    for name in location_names:
        if new_location == name:
            return 'Hrvatska'
    
        for char in location.lower():
            if char not in accepted_chars + ' ':
                new_location = new_location.replace(char, '')
        
        if name in location.lower():
            new_location = new_location.replace(name, '')
        new_location = new_location.strip()
        
    if new_location == '':
        new_location = 'Hrvatska'
    return new_location.title()


def generate_edges_df(user_df):
    not_found = 0
    users_data = []
    total_users = len(user_df.user_id.unique())
    
    for user_id in user_df.user_id.unique():
        user_path = os.path.join(settings.USER_IDS_DIR, '{}.json'.format(user_id))
        if os.path.exists(user_path):
            user = fileio.read_content(user_path, 'json')
            for follower in user.get('followers', []):
                users_data.append({
                    'source': str(follower),
                    'target': str(user_id)
                })
        else:
            not_found += 1
    
    print("Found {}/{} users".format(total_users-not_found, total_users))
    return pd.DataFrame(users_data)

In [11]:
user_df = get_user_df()
user_df = transform_user_df(user_df)
print(user_df.info())
user_df.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 705 entries, 0 to 704
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   user_id          705 non-null    int64              
 1   name             705 non-null    object             
 2   screen_name      705 non-null    object             
 3   location         705 non-null    object             
 4   protected        705 non-null    bool               
 5   verified         705 non-null    bool               
 6   followers_count  705 non-null    int64              
 7   friends_count    705 non-null    int64              
 8   statuses_count   705 non-null    int64              
 9   created_at       705 non-null    datetime64[ns, UTC]
 10  is_croatian      705 non-null    bool               
 11  clean_location   705 non-null    object             
dtypes: bool(3), datetime64[ns, UTC](1), int64(4), object(4)
memory usage: 51.8+ KB

Unnamed: 0,user_id,name,screen_name,location,protected,verified,followers_count,friends_count,statuses_count,created_at,is_croatian,clean_location
128,1074194643489828864,Blaž Ereš,Baka19821,Republic of Croatia,False,False,60,233,1098,2018-12-16 06:48:55+00:00,True,Hrvatska
304,2938146454,Dalmatia Exclusive,DalmatiaE,"Brela, Croatia",False,False,215,328,722,2014-12-23 09:33:09+00:00,True,Brela
152,269940296,Ivan Brkic,ivanbrkiczd,"Zadar,Croatia",False,False,74,253,71,2011-03-21 18:42:36+00:00,True,Zadar
131,16208756,ZelimirGraf,ZelimirGraf,Croatia,False,False,61,392,29,2008-09-09 19:35:00+00:00,True,Hrvatska
77,64349865,Semso,vranic,Zagreb-Croatia,False,False,40,258,205,2009-08-10 06:34:22+00:00,True,Zagreb


In [13]:
edges_df = generate_edges_df(user_df)
edges_df.info()

Found 9/705 users
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Empty DataFrame

In [15]:
os.path.exists(os.path.join(settings.USER_IDS_DIR, '{}.json'.format(37629519)))

True

In [16]:
user_df[user_df['user_id'] == 37629519]

Unnamed: 0,user_id,name,screen_name,location,protected,verified,followers_count,friends_count,statuses_count,created_at,is_croatian,clean_location
