In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import math 


def ostype_add_directory(os_type, directory):
    base_dir_mac = "/Volumes"
    base_dir_linux = "/media/f140926"
    base_dir = base_dir_mac if os_type.lower() == 'mac' else base_dir_linux
    based_directory = base_dir + directory 
    return based_directory
    

def read_fine_grained(os_type,dir_finegrained,file_number):
    
    finegrained_file = "/fine_grained{}.txt"
    
    fine_grained_location=ostype_add_directory(os_type,dir_finegrained)+ finegrained_file.format(file_number)
    
    df = pd.read_csv(fine_grained_location,
                     sep="|", skiprows=0,
                     header=0, encoding='ISO-8859-1')
    df = df.drop(['Unnamed: 0', 'Unnamed: 5'], axis=1)
    df = df.rename(columns=lambda x: x.strip())
    for i in df.columns:
        df[i] = df[i].astype(str)
    df = df.apply(lambda x: x.str.strip())
    df = df.iloc[1:, :]

    for i in df.columns[1:4]:
        df[i] = df[i].astype(int)        
    df = infer_datetime(df,file_number)
    df = calculate_date_values(df)
    return df


def infer_datetime(df,file_number):
    # get the expected month from the first entry
    expected_month = math.ceil(file_number/2)

    # try to convert the first entry of time with the specified format
    try:
        dt = pd.to_datetime(df.iloc[0,0], format='%Y-%m-%d %H')
        if dt.month == expected_month:
            print('Datetime format is correct')
            df['time'] = pd.to_datetime(df['time'], format='%Y-%m-%d %H')
        else:
            raise ValueError
    except ValueError:
        # if it fails, then try automatic inference
        print('Datetime format is incorrect, trying automatic inference')
        try:
            dt = pd.to_datetime(df.iloc[0,0])
            if dt.month == expected_month:
                print('Automatic inference is successful')
                df['time'] = pd.to_datetime(df['time'])
            else:
                raise ValueError
        except ValueError:
            print('Automatic inference failed')
    return df


def customer_signals_analysis(df):
    df['time'] = pd.to_datetime(df['time'])
    df['hour'] = df['time'].dt.hour
    df['day_of_week'] = df['time'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['is_night'] = df['hour'].isin(range(18, 24)) | df['hour'].isin(range(0, 7)) 
    df['is_day'] = (df['hour'].isin(range(7,18))).astype(int)
    df['is_23'] = (df['hour'] == 23).astype(int)
    df['is_other_times'] = (df['hour'] != 23).astype(int)
    df['day'] = df['time'].dt.date
    df['site_count'] = df.groupby('customer_id')['site_id'].transform('nunique')
    df['day_count'] = df.groupby('customer_id')['day'].transform('nunique')

    customers_analysis = df.groupby('customer_id').agg({
        'time': 'count',
        'is_23': 'sum',
        'is_other_times': 'sum',
        'is_weekend': 'sum',
        'is_night': 'sum',
        'is_day': 'sum',
        'day_count': 'first',
        'site_count': 'first'
    }).rename(columns={
        'time': 'signal_count',
        'is_23': 'signal_at_23',
        'is_other_times': 'signal_at_other_times',
        'is_weekend': 'signal_on_weekend',
        'is_night': 'signal_at_night',
        'is_day': 'signal_during_day',
        'day_count': 'unique_days_count',
        'site_count': 'unique_sites_count'
    })

    return customers_analysis.reset_index()

def filter_customers(cust_df, unique_days_threshold, signal_23_ratio_threshold, weekend_signal_threshold, night_signal_threshold, day_signal_threshold):
    filtered_df = cust_df[
        (cust_df['unique_days_count'] >= unique_days_threshold) &
        (cust_df['signal_at_23'] / cust_df['signal_count'] <= signal_23_ratio_threshold) &
        (cust_df['signal_on_weekend'] > weekend_signal_threshold) &
        (cust_df['signal_at_night'] > night_signal_threshold) &
        (cust_df['signal_during_day'] > day_signal_threshold)
    ]
    return filtered_df


def calculate_date_values(df):
    df['time'] = pd.to_datetime(df['time'])
    df['hour'] = df['time'].dt.hour
    df['day'] = df['time'].dt.day
    df['week'] = df['time'].dt.isocalendar().week
    df['month'] = df['time'].dt.month
    df['year'] = df['time'].dt.year
    df['dayofweek'] = df['time'].dt.dayofweek
    df['is_weekend'] = df['dayofweek'].apply(lambda x: 1 if x >= 5 else 0)
    df['night_dummy'] = df['hour'].apply(lambda x: 1 if x >= 19 or x < 7 else 0)

    return df

def filter_by_hour(df, hour, exclude_hour=False):
    return df[df['hour'] != hour].reset_index(drop=True) if exclude_hour else df[df['hour'] == hour].reset_index(drop=True)
def filter_by_customers(df, customer_list):
    return df[df['customer_id'].isin(customer_list)].reset_index(drop=True)
def filter_by_city(df, city_list, exclude_city_list=False):
    return df[df['city'].isin(city_list)].reset_index(drop=True) if exclude_city_list else df[~df['city'].isin(city_list)].reset_index(drop=True)


def filter_dataset(df, hour, customer_list, city_list, exclude_hour=False, exclude_city_list=False):
    df_filtered_hour = filter_by_hour(df, hour, exclude_hour)
    df_filtered_customer=filter_by_customers(df_filtered_hour, customer_list)
    df_filtered_city_and_customer=filter_by_city(df_filtered_customer,city_list,exclude_city_list)
    return df_filtered_city_and_customer

def customer_signals_analysis(df):
    df['time'] = pd.to_datetime(df['time'])
    df['hour'] = df['time'].dt.hour
    df['day_of_week'] = df['time'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['is_night'] = df['hour'].isin(range(18, 24)) | df['hour'].isin(range(0, 7)) 
    df['is_day'] = (df['hour'].isin(range(7,18))).astype(int)
    df['is_23'] = (df['hour'] == 23).astype(int)
    df['is_other_times'] = (df['hour'] != 23).astype(int)
    df['day'] = df['time'].dt.date
    df['site_count'] = df.groupby('customer_id')['site_id'].transform('nunique')
    df['day_count'] = df.groupby('customer_id')['day'].transform('nunique')

    customers_analysis = df.groupby('customer_id').agg({
        'time': 'count',
        'is_23': 'sum',
        'is_other_times': 'sum',
        'is_weekend': 'sum',
        'is_night': 'sum',
        'is_day': 'sum',
        'day_count': 'first',
        'site_count': 'first'
    }).rename(columns={
        'time': 'signal_count',
        'is_23': 'signal_at_23',
        'is_other_times': 'signal_at_other_times',
        'is_weekend': 'signal_on_weekend',
        'is_night': 'signal_at_night',
        'is_day': 'signal_during_day',
        'day_count': 'unique_days_count',
        'site_count': 'unique_sites_count'
    })

    return customers_analysis.reset_index()

def filter_customers(cust_df, unique_days_threshold, signal_23_ratio_threshold, weekend_signal_threshold, night_signal_threshold, day_signal_threshold):
    filtered_df = cust_df[
        (cust_df['unique_days_count'] >= unique_days_threshold) &
        (cust_df['signal_at_23'] / cust_df['signal_count'] <= signal_23_ratio_threshold) &
        (cust_df['signal_on_weekend'] > weekend_signal_threshold) &
        (cust_df['signal_at_night'] > night_signal_threshold) &
        (cust_df['signal_during_day'] > day_signal_threshold)
    ]
    return filtered_df


tower_location = ostype_add_directory('mac','/Extreme SSD/MPD_based_indicators_of_migration/Seasonal_migration/towers/towers.csv')
dir_finegrained = "/Extreme SSD/Data - Location/Hummingbird_Location_Data/F_Fine_grained_mobility"

dir_summary = "/Extreme SSD/Summary_Data/Fine grained/"

summary_file = "fine_grained{}.csv"

pd.set_option('display.max_columns', None)


In [2]:
x=pd.read_csv('scikit_indicators_week_11.csv')

In [4]:
len(x['customer_id'].unique())

9818

In [None]:
df=read_fine_grained('mac',dir_finegrained, 11)
df_tower= pd.read_csv(tower_location,sep=';')
df=df.merge(df_tower,on='site_id',how='left')

In [76]:
sum_path = ostype_add_directory('mac', dir_summary)+summary_file.format(11)
df_summary = pd.read_csv(sum_path)[['customer_id','city']]
cust_df = customer_signals_analysis(df)
filtered_df = filter_customers(cust_df, unique_days_threshold=10, signal_23_ratio_threshold=0.5, weekend_signal_threshold=5, night_signal_threshold=10, day_signal_threshold=10)
df_filtered=df[df['customer_id'].isin(filtered_df['customer_id'].unique().tolist())==True].reset_index(drop=True)
del(cust_df)
del(filtered_df)


In [77]:
df_filtered.head()

Unnamed: 0.1,time,customer_id,segment,site_id,hour,day,week,month,year,dayofweek,is_weekend,night_dummy,Unnamed: 0,city,district,city_district,long,lat,day_of_week,is_night,is_day,is_23,is_other_times,site_count,day_count
0,2020-06-03 08:00:00,243937161,14,24723,8,2020-06-03,23,6,2020,2,0,0,24623,ISTANBUL,ESENYURT,ISTANBUL_ESENYURT,28.677672,41.027752,2,False,1,0,1,43,15
1,2020-06-03 08:00:00,243937161,14,30783,8,2020-06-03,23,6,2020,2,0,0,30683,ISTANBUL,ESENYURT,ISTANBUL_ESENYURT,28.681245,41.03244,2,False,1,0,1,43,15
2,2020-06-03 08:00:00,243937161,14,31378,8,2020-06-03,23,6,2020,2,0,0,31278,ISTANBUL,ESENYURT,ISTANBUL_ESENYURT,28.664615,41.064309,2,False,1,0,1,43,15
3,2020-06-03 08:00:00,243937161,14,22677,8,2020-06-03,23,6,2020,2,0,0,22577,ISTANBUL,BUYUKCEKMECE,ISTANBUL_BUYUKCEKMECE,28.608653,41.074722,2,False,1,0,1,43,15
4,2020-06-03 10:00:00,243937161,14,1863,10,2020-06-03,23,6,2020,2,0,0,1763,ISTANBUL,BUYUKCEKMECE,ISTANBUL_BUYUKCEKMECE,28.615954,41.089965,2,False,1,0,1,43,15


In [79]:
harvest_cities=['GIRESUN','ORDU','TRABZON']

cust_list_ist=df_summary[df_summary['city']=='ISTANBUL'].reset_index(drop=True)['customer_id'].unique().tolist()
cust_list_giresun=df_summary[df_summary['city']=='GIRESUN'].reset_index(drop=True)['customer_id'].unique().tolist()
cust_list_ordu=df_summary[df_summary['city']=='ORDU'].reset_index(drop=True)['customer_id'].unique().tolist()
cust_list_trabzon=df_summary[df_summary['city']=='TRABZON'].reset_index(drop=True)['customer_id'].unique().tolist()


cust_list_destination=cust_list_ordu+cust_list_giresun+cust_list_trabzon


df_ist=df[df['customer_id'].isin(cust_list_ist)==True].reset_index(drop=True)
customer_ids_of_seasonal_migrants_1=df_ist[df_ist['city'].isin(harvest_cities)==True]['customer_id'].unique().tolist()

df_destination=df[df['customer_id'].isin(cust_list_destination)==True].reset_index(drop=True)
customer_ids_of_seasonal_migrants_2=df_destination[df_destination['city'].isin(['ISTANBUL'])==True]['customer_id'].unique().tolist()


potential_seasonal_migrants=customer_ids_of_seasonal_migrants_1+customer_ids_of_seasonal_migrants_2


cust_list_nonmovers = [item for item in cust_list_ist if item not in potential_seasonal_migrants]

df_potential_seasonal_migrants = pd.DataFrame(potential_seasonal_migrants, columns=['customer_id'])
df_nonmovers= pd.DataFrame(cust_list_nonmovers, columns=['customer_id'])


In [81]:
df_nonmovers[df_nonmovers['customer_id']==425944642]

Unnamed: 0,customer_id
6398,425944642


In [83]:
list_customers=df_nonmovers['customer_id'].unique().tolist()+df_potential_seasonal_migrants['customer_id'].unique().tolist()

# Check the filtering process

In [91]:
df_finegrained_filtered = filter_dataset(df, 23, list_customers, ['ISTANBUL','KOCAELI'], exclude_hour=True, exclude_city_list=True)

In [92]:
df_finegrained_filtered

Unnamed: 0.1,time,customer_id,segment,site_id,hour,day,week,month,year,dayofweek,is_weekend,night_dummy,Unnamed: 0,city,district,city_district,long,lat,day_of_week,is_night,is_day,is_23,is_other_times,site_count,day_count
0,2020-06-03 08:00:00,243937161,14,24723,8,2020-06-03,23,6,2020,2,0,0,24623,ISTANBUL,ESENYURT,ISTANBUL_ESENYURT,28.677672,41.027752,2,False,1,0,1,43,15
1,2020-06-03 08:00:00,243937161,14,30783,8,2020-06-03,23,6,2020,2,0,0,30683,ISTANBUL,ESENYURT,ISTANBUL_ESENYURT,28.681245,41.032440,2,False,1,0,1,43,15
2,2020-06-03 08:00:00,243937161,14,31378,8,2020-06-03,23,6,2020,2,0,0,31278,ISTANBUL,ESENYURT,ISTANBUL_ESENYURT,28.664615,41.064309,2,False,1,0,1,43,15
3,2020-06-03 08:00:00,243937161,14,22677,8,2020-06-03,23,6,2020,2,0,0,22577,ISTANBUL,BUYUKCEKMECE,ISTANBUL_BUYUKCEKMECE,28.608653,41.074722,2,False,1,0,1,43,15
4,2020-06-03 10:00:00,243937161,14,1863,10,2020-06-03,23,6,2020,2,0,0,1763,ISTANBUL,BUYUKCEKMECE,ISTANBUL_BUYUKCEKMECE,28.615954,41.089965,2,False,1,0,1,43,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5732459,2020-06-13 14:00:00,356531457,1,9610,14,2020-06-13,24,6,2020,5,1,0,9510,ISTANBUL,KUCUKCEKMECE,ISTANBUL_KUCUKCEKMECE,28.773220,41.015877,5,False,1,0,1,33,15
5732460,2020-06-13 15:00:00,356531457,1,6562,15,2020-06-13,24,6,2020,5,1,0,6462,ISTANBUL,KUCUKCEKMECE,ISTANBUL_KUCUKCEKMECE,28.781576,41.008192,5,False,1,0,1,33,15
5732461,2020-06-13 20:00:00,356531457,1,9610,20,2020-06-13,24,6,2020,5,1,1,9510,ISTANBUL,KUCUKCEKMECE,ISTANBUL_KUCUKCEKMECE,28.773220,41.015877,5,True,0,0,1,33,15
5732462,2020-06-13 20:00:00,356531457,1,24215,20,2020-06-13,24,6,2020,5,1,1,24115,ISTANBUL,KUCUKCEKMECE,ISTANBUL_KUCUKCEKMECE,28.798007,41.009892,5,True,0,0,1,33,15


In [93]:
filter_dataset(df, 23, list_customers, ['ISTANBUL','KOCAELI'], exclude_hour=True, exclude_city_list=False)

Unnamed: 0.1,time,customer_id,segment,site_id,hour,day,week,month,year,dayofweek,is_weekend,night_dummy,Unnamed: 0,city,district,city_district,long,lat,day_of_week,is_night,is_day,is_23,is_other_times,site_count,day_count
0,2020-06-15 00:00:00,836230856,14,34089,0,2020-06-15,25,6,2020,0,0,1,33989,BOLU,BOLU MERKEZ,BOLU_BOLU MERKEZ,31.659349,40.736554,0,True,0,0,1,80,15
1,2020-06-15 00:00:00,836230856,14,29428,0,2020-06-15,25,6,2020,0,0,1,29328,BOLU,BOLU MERKEZ,BOLU_BOLU MERKEZ,31.659349,40.736554,0,True,0,0,1,80,15
2,2020-06-15 01:00:00,836230856,14,34089,1,2020-06-15,25,6,2020,0,0,1,33989,BOLU,BOLU MERKEZ,BOLU_BOLU MERKEZ,31.659349,40.736554,0,True,0,0,1,80,15
3,2020-06-15 02:00:00,836230856,14,10571,2,2020-06-15,25,6,2020,0,0,1,10471,ANKARA,YENIMAHALLE,ANKARA_YENIMAHALLE,32.726019,40.013798,0,True,0,0,1,80,15
4,2020-06-15 02:00:00,836230856,14,32618,2,2020-06-15,25,6,2020,0,0,1,32518,ANKARA,ETIMESGUT,ANKARA_ETIMESGUT,32.621086,39.865005,0,True,0,0,1,80,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197408,2020-06-13 19:00:00,58004579,2,8197,19,2020-06-13,24,6,2020,5,1,1,8097,MERSIN,YENISEHIR,MERSIN_YENISEHIR,34.513326,36.844688,5,True,0,0,1,137,15
197409,2020-06-13 20:00:00,548709442,5,16421,20,2020-06-13,24,6,2020,5,1,1,16321,MERSIN,TOROSLAR,MERSIN_TOROSLAR,34.404674,37.048784,5,True,0,0,1,137,15
197410,2020-06-13 20:00:00,58004579,2,16421,20,2020-06-13,24,6,2020,5,1,1,16321,MERSIN,TOROSLAR,MERSIN_TOROSLAR,34.404674,37.048784,5,True,0,0,1,137,15
197411,2020-06-13 21:00:00,548709442,5,27014,21,2020-06-13,24,6,2020,5,1,1,26914,MERSIN,TARSUS,MERSIN_TARSUS,34.859416,37.063861,5,True,0,0,1,137,15


In [61]:
scikit_week=pd.read_csv('scikit_indicators_week_11.csv')

In [62]:
scikit_week=scikit_week.drop_duplicates('customer_id').reset_index(drop=True)

In [63]:
scikit_week.sort_values(by='radius_of_gyration_x').head(60)

Unnamed: 0.1,Unnamed: 0,customer_id,radius_of_gyration_x,2k_radius_of_gyration,random_entropy,uncorrelated_entropy,radius_of_gyration_y,maximum_distance,distance_straight_line,waiting_times,number_of_locations,lat,lng,max_distance_from_home,number_of_visits,location_frequency
3330,28922,999717974,0.0,0.0,0.0,0.0,0.0,,0.0,[],1,41.341781,27.950507,0.0,1,1.0
1409,12294,425944642,0.0,0.0,0.0,0.0,0.0,,0.0,[],1,40.176413,29.14116,0.0,1,1.0
1408,12293,425852661,0.0,0.0,0.0,0.0,0.0,,0.0,[],1,41.283713,27.999182,0.0,1,1.0
2663,23014,796729647,0.0,0.0,0.0,0.0,0.0,,0.0,[],1,41.186987,36.65661,0.0,1,1.0
1404,12250,423473456,0.0,0.0,0.0,0.0,0.0,,0.0,[],1,40.823385,30.920821,0.0,1,1.0
2667,23080,797573139,0.0,0.0,0.0,0.0,0.0,,0.0,[],1,40.168929,29.078994,0.0,1,1.0
533,4393,164641008,0.0,0.0,0.0,0.0,0.0,,0.0,[],1,39.657606,30.574094,0.0,1,1.0
1391,12154,418133663,0.0,0.0,0.0,0.0,0.0,,0.0,[],1,41.283713,27.999182,0.0,1,1.0
1390,12153,417471122,0.0,0.0,0.0,0.0,0.0,,0.0,[],1,41.025371,27.89732,0.0,1,1.0
1411,12310,426628831,0.0,0.0,0.0,0.0,0.0,,0.0,[],1,40.17944,28.820402,0.0,1,1.0


In [82]:
df[(df['customer_id']==425944642)&(df['city']=='BURSA')]

Unnamed: 0.1,time,customer_id,segment,site_id,hour,day,week,month,year,dayofweek,is_weekend,night_dummy,Unnamed: 0,city,district,city_district,long,lat,day_of_week,is_night,is_day,is_23,is_other_times,site_count,day_count
10935950,2020-06-12,425944642,2,22949,0,2020-06-12,24,6,2020,4,0,1,22849,BURSA,YILDIRIM,BURSA_YILDIRIM,29.14116,40.176413,4,True,0,0,1,60,15


In [64]:
scikit_weekend=pd.read_csv('scikit_indicators_weekend_11.csv')
scikit_weekend=scikit_weekend.drop_duplicates('customer_id').reset_index(drop=True)
scikit_weekend[scikit_weekend['customer_id']==425944642]

Unnamed: 0.1,Unnamed: 0,customer_id,radius_of_gyration_x,2k_radius_of_gyration,random_entropy,uncorrelated_entropy,radius_of_gyration_y,maximum_distance,distance_straight_line,waiting_times,number_of_locations,lat,lng,max_distance_from_home,number_of_visits,location_frequency


In [65]:
scikit_weekend

Unnamed: 0.1,Unnamed: 0,customer_id,radius_of_gyration_x,2k_radius_of_gyration,random_entropy,uncorrelated_entropy,radius_of_gyration_y,maximum_distance,distance_straight_line,waiting_times,number_of_locations,lat,lng,max_distance_from_home,number_of_visits,location_frequency
0,0,131870,1.053343e+02,103.293759,3.700440,3.029721,1.053343e+02,133.442395,1257.365963,[ 0. 3600. 3600. 0. 0. 3600. ...,13,38.302073,26.642011,295.178447,29,0.413793
1,13,150071,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,[ 0. 0. 3600.],1,41.283713,27.999182,0.000000,4,1.000000
2,14,307628,2.044743e+01,11.866669,1.584963,0.905587,2.044743e+01,39.017973,72.929794,[ 3600. 14400. 0. 18000. 3600. 7200. 36...,3,40.272922,27.972848,72.701802,15,0.800000
3,17,734117,1.343363e+01,2.950209,2.321928,1.014404,1.343363e+01,45.074690,206.309555,[ 0. 0. 3600. 0. 3600. 3600. 0. ...,5,41.047164,30.648963,45.074690,23,0.826087
4,22,1039606,1.208253e+01,6.485465,2.000000,0.453359,1.208253e+01,68.842323,247.318144,[ 3600. 0. 0. 3600. 7200. 0. ...,4,41.782416,27.245204,68.842323,60,0.933333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2617,15751,998486074,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,,0.000000,[],1,39.975527,28.521404,0.000000,1,1.000000
2618,15752,998551050,1.516378e+01,12.111928,1.584963,1.360964,1.516378e+01,40.386277,95.050322,[583200. 3600. 0. 0. 3600. ...,3,40.266087,29.619843,30.289263,10,0.500000
2619,15755,998760828,2.213553e+02,201.604268,4.087463,2.633143,2.213553e+02,130.519371,1075.563876,[ 0. 0. 3600. 0. 0. ...,17,40.993058,34.043953,374.512136,75,0.546667
2620,15772,998838971,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,[10800. 21600. 10800. 7200. 0. 10800. 108...,1,40.952473,39.025068,0.000000,16,1.000000
