In [None]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import geoip2.database
import myLogReader as mlr
import re
import os
import sys
import datetime as dt
# pandas relocated its plotting helpers from pandas.tools.plotting to
# pandas.plotting; try the current location first and fall back to the
# legacy module so the notebook still imports on old installs.
try:
    from pandas.plotting import autocorrelation_plot
except ImportError:
    from pandas.tools.plotting import autocorrelation_plot
from statsmodels.graphics.tsaplots import plot_acf
from sklearn.metrics import mean_absolute_error

%matplotlib inline

## 1- Automate - Read and prepare log data into a DataFrame

In [None]:
# Input locations: the GeoLite2 city database file and the raw log directory.
geoLiteIPDBPath = '../data/GeoLite2-City_20181009/GeoLite2-City.mmdb'
logsPath = '../data/logs'

# Instantiate the project's log reader and open it against the GeoLite2 database file.
myLogReader = mlr.log()
myLogReader.openReader(geoLiteIPDBPath)

In [None]:
# Write one CSV per iteration, numbered 1..303.
# NOTE(review): readLogs(logsPath, 7) presumably returns the next 7 days of
# logs on each call — confirm against myLogReader.
# After the loop, `df` still holds the frame from the final iteration.
for i in range(1, 304):
    week_file_name = 'AggregatedData_week_%d.csv' % i
    print(week_file_name)
    df = myLogReader.readLogs(logsPath, 7)
    output_path = '../data/AggregatedData_week_%d.csv' % i
    df.to_csv(output_path)

In [None]:
# Release the reader that was opened with openReader() once the weekly export is finished.
myLogReader.closeReader()

In [None]:
# Preview the first rows of `df`, i.e. the frame returned by the last readLogs call of the export loop.
df.head()

## 2- Load PlacementSummary and Logs Data aggregated per week

In [None]:
# Load the weekly log aggregates and index them by the 'calendar-year-week' column.
log_aggregated_per_week_df = (
    pd.read_csv('../data/LogsAggregatedData_per_week.csv')
    .set_index('calendar-year-week')
)

In [None]:
# Preview the first rows of the weekly log aggregates.
log_aggregated_per_week_df.head()

In [None]:
# Load the placement summary and index it by the 'YearWeek' column.
placement_summary_df = (
    pd.read_csv('../data/PlacementSummary.csv')
    .set_index('YearWeek')
)

In [None]:
# Preview the first rows of the placement summary.
placement_summary_df.head()

In [None]:
# Attach the placement summary columns to the weekly log aggregates by index.
# how='left' is DataFrame.join's default: every log week is kept, and weeks
# with no placement row get NaN in the placement columns.
log_and_placements_aggregated_per_week_df = log_aggregated_per_week_df.join(
    placement_summary_df, how='left'
)
log_and_placements_aggregated_per_week_df.head()

In [None]:
# Count missing values per column of the joined frame.
log_and_placements_aggregated_per_week_df.isna().sum()

In [None]:
# Column dtypes and non-null counts of the joined frame (before the row subset is taken).
log_and_placements_aggregated_per_week_df.info()

In [None]:
# Keep the rows at positions 3..29 of the joined frame.
# The slice is recomputed from the two source frames rather than from the
# variable being overwritten, so re-running this cell always yields the same
# rows instead of slicing an already-sliced frame.
# NOTE(review): the original comment describes these rows as the first weeks
# of 2018; the positions are hard-coded, so confirm them against the data.
log_and_placements_aggregated_per_week_df = (
    log_aggregated_per_week_df
    .join(placement_summary_df)
    .iloc[3:30, :]
)

In [None]:
# Default size for this and every subsequent figure: 20 inches wide, 7 inches tall.
fig_size = [20, 7]
plt.rcParams["figure.figsize"] = fig_size

print ("Current size:",fig_size)

# Weekly series to overlay: (column of the joined frame, legend label).
weekly_series = [
    ('client-ip-unique-count', "Unique IP"),
    ('PlacementAllocatedInProgressCount', "Placement in progress"),
    ('cs-username-unique-count', "Unique User"),
]

# Explicit figure/axes instead of the implicit pyplot state, so the labels
# below are attached to exactly this plot.
fig, ax = plt.subplots()
for column, label in weekly_series:
    ax.plot(log_and_placements_aggregated_per_week_df.index,
            log_and_placements_aggregated_per_week_df[column],
            label=label)
ax.set_xlabel("Calendar year-week")
ax.set_ylabel("Count per week")
ax.set_title("Weekly unique IPs, unique users and placements in progress")
# Trailing ';' hides the Legend repr that would otherwise be echoed as the cell value.
ax.legend();

In [None]:
# List the column names available in the joined frame.
log_and_placements_aggregated_per_week_df.columns

In [None]:
# Columns compared pairwise: the two log-derived counts and the three placement counts.
pairplot_columns = [
    'client-ip-unique-count',
    'cs-username-unique-count',
    'PlacementCount',
    'PlacementCompletedCount',
    'PlacementAllocatedInProgressCount',
]
sns.pairplot(data=log_and_placements_aggregated_per_week_df, vars=pairplot_columns)

In [None]:
# Split the unique-IP series by index label: rows up to and including
# "2018-37" are used for training, rows from "2018-38" onward for testing.
# NOTE(review): label slicing depends on how the year-week index is formatted
# and ordered — confirm both labels fall where expected.
target_columns = ['client-ip-unique-count']
weekly_frame = log_and_placements_aggregated_per_week_df
df_train = weekly_frame.loc[:"2018-37", target_columns]
df_test = weekly_frame.loc["2018-38":, target_columns]

#### Autocorrelation

In [None]:
# Autocorrelation of the training series, up to 30 lags.
# The lag count is capped at len(df_train) - 1 because a series of n points
# only has autocorrelations for lags 0..n-1, and the frame was earlier cut
# down to fewer than 30 rows.
max_lag = min(30, len(df_train) - 1)
# Trailing ';' keeps the returned Figure from being echoed as the cell value,
# which would otherwise draw the plot a second time.
plot_acf(df_train, lags=max_lag);

In [None]:
# Fit an ARIMA model with order (p=1, d=0, q=0) on the training weeks.
# `order` is passed by keyword: the current statsmodels signature is
# ARIMA(endog, exog=None, order=...), so a positional (1, 0, 0) would be read
# as exogenous regressors. The keyword form is accepted by both the legacy
# and the current ARIMA signatures.
model = sm.tsa.ARIMA(df_train, order=(1,0,0)).fit()

# Dynamic prediction from label '2018-38' through '2018-40'.
# NOTE(review): string start/end labels require an index that the model can
# resolve them against — confirm with the actual year-week index.
predictions = model.predict('2018-38','2018-40',dynamic=True)

In [None]:
# Disabled experiment (kept for reference): cast every int64 column of the joined frame to float.
#log_and_placements_aggregated_per_week_df.loc[:, log_and_placements_aggregated_per_week_df.dtypes == np.int64] = log_and_placements_aggregated_per_week_df.loc[:, log_and_placements_aggregated_per_week_df.dtypes == np.int64].astype(float)
# Column dtypes and non-null counts of the joined frame at this point in the notebook.
log_and_placements_aggregated_per_week_df.info()

## Other stuff - ad hoc inspection of the last weekly log frame and the joined data


In [None]:
# Column dtypes and non-null counts of `df`, the frame left over from the weekly export loop.
df.info()

In [None]:
# Summary statistics for the 'client-ip' column of `df`.
df['client-ip'].describe()

In [None]:
# Summarise only the columns of the joined frame whose dtype is object.
object_column_mask = log_and_placements_aggregated_per_week_df.dtypes == object
log_and_placements_aggregated_per_week_df.loc[:, object_column_mask].describe()