## Modules

In [1]:
#Data wrangling
import pandas as pd
import plotly.express as px
import numpy as np

import datetime

from statsmodels.tsa.api import SimpleExpSmoothing

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Environment setupt

pd.set_option('display.float_format', lambda x: '%.5f' % x)

## Classes

In [45]:
class FeatureEngineering():
    
    def __init__(self,df):
        self.df= df
        self.temp= pd.DataFrame()
        
    def createLags(self,column):
        temp= self.df[[column]]
        for i in range(1,34):
            temp[column+"_"+str(i)] = temp[column].shift(i)
        self.temp=temp
    
    def smoothing(self, column):
        for i in range(2,10,1):
            j=i/10
            fit1 = SimpleExpSmoothing(self.temp[column], initialization_method="heuristic").fit(smoothing_level=j, optimized=False)
            lista=fit1.fittedvalues
            self.temp['exp'+str(j)]=lista
            
    def createMA(self,column):
        for i in range(10,50,10):
            self.temp["MA"+str(i)]= self.temp[column].rolling(window =i).mean()
    
    def splitData(self,column):
        X_train, X_test, y_train, y_test = train_test_split( self.temp.drop([column],axis=1).to_numpy(),self.temp[[column]].to_numpy(), shuffle=False)
        sc = MinMaxScaler()
        sc2 = MinMaxScaler()

        sc.fit(X_train)
        sc2.fit(y_train)

        X_train, X_test = [sc.transform(x) for x in [X_train, X_test]]
        y_train, y_test= sc2.transform(y_train),sc2.transform(y_test)
        X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
        X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
        
        return X_train, X_test, y_train, y_test,sc,sc2
    
    def createVariables(self,column):
        self.createLags(column)
        self.smoothing(column)
        self.createMA(column)
        self.temp.dropna(inplace=True)
        
    
        

In [24]:
class CubeCrimesGenerator():
    
    def __init__(self,df):
        self.df = df.copy()
        data= self.df.copy()
        data["Date"]=data["date_rptd"].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d"))
        self.start= str(data["Date"].min())
        self.end= str(data["Date"].max())
        self.ls_dfs=[]
    
    def checkDates(self,data,zone,crimeType):
        data["Date"]=data["date_rptd"].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d"))
        date_index = pd.date_range(start=self.start, end=self.end, freq="D")
        data = data.set_index("Date").reindex(date_index)
        data.drop(columns=["date_rptd"], inplace=True)
        field = "Crimes_Z"+str(zone)+"T"+str(crimeType)
        data.rename(columns={0: field}, inplace= True)
        data[field]= data[field].fillna(0)
        return data
    
    
    def generateDataframes(self):
        ls_dfs= []
        for zone in range(5):
            test = self.df[self.df['zonas']==zone].copy()
            for crime in range(100,105):
                test2 = test[test['crm_cd']==crime].copy()
                temp =test2.groupby('date_rptd').size().copy()
                temp= temp.to_frame()
                temp= temp.reset_index()
                data= self.checkDates(temp,zone,crime)
                self.ls_dfs.append(data)
                
    def generateCube(self):
        self.generateDataframes()
        df_final= self.ls_dfs[0]

        for i in range(1, len(self.ls_dfs)):
            column= self.ls_dfs[i].columns[0]
            df_final[column]= list(self.ls_dfs[i][column])
            
        return df_final
                

## Test

In [2]:
df = pd.read_csv('DatosLimpiosSuperclases.csv')

zonas=pd.read_csv('clusters.csv')

df=df.merge(zonas,left_on='dr_no',right_on='dr_no')

df_ = df[['date_rptd','crm_cd','zonas']]

In [6]:
df.sample(700).to_csv("DatosMuestra.csv")

In [26]:
test= CubeCrimesGenerator(df_)

In [27]:
cubo= test.generateCube()

In [28]:
cubo

Unnamed: 0,Crimes_Z0T100,Crimes_Z0T101,Crimes_Z0T102,Crimes_Z0T103,Crimes_Z0T104,Crimes_Z1T100,Crimes_Z1T101,Crimes_Z1T102,Crimes_Z1T103,Crimes_Z1T104,...,Crimes_Z3T100,Crimes_Z3T101,Crimes_Z3T102,Crimes_Z3T103,Crimes_Z3T104,Crimes_Z4T100,Crimes_Z4T101,Crimes_Z4T102,Crimes_Z4T103,Crimes_Z4T104
2020-01-01,80,37,1,38,40,26,11,0.00000,5,12,...,10,7,1.00000,7,19,4.00000,4.00000,1.00000,6.00000,2.00000
2020-01-02,55,37,7,52,62,17,15,3.00000,7,23,...,8,10,0.00000,18,22,8.00000,5.00000,0.00000,4.00000,4.00000
2020-01-03,42,32,6,46,58,24,18,3.00000,25,29,...,11,11,0.00000,20,17,3.00000,3.00000,0.00000,4.00000,10.00000
2020-01-04,52,29,5,53,86,14,7,5.00000,10,28,...,11,11,6.00000,18,22,5.00000,3.00000,1.00000,8.00000,4.00000
2020-01-05,48,43,7,51,62,12,17,4.00000,11,28,...,12,14,4.00000,19,23,10.00000,7.00000,1.00000,1.00000,3.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-02-23,40,32,21,57,102,24,10,13.00000,26,30,...,13,17,13.00000,29,42,5.00000,3.00000,2.00000,6.00000,10.00000
2023-02-24,48,32,15,65,91,12,9,2.00000,25,34,...,11,10,6.00000,31,24,2.00000,3.00000,2.00000,10.00000,14.00000
2023-02-25,53,38,8,53,75,8,3,1.00000,7,19,...,11,2,3.00000,16,27,9.00000,2.00000,1.00000,3.00000,3.00000
2023-02-26,53,40,7,44,81,24,14,12.00000,29,46,...,13,7,5.00000,27,28,9.00000,2.00000,0.00000,1.00000,9.00000


In [None]:
cubo[["Crimes_Z0T100"]]

Unnamed: 0,Crimes_Z0T100
2020-01-01,80
2020-01-02,55
2020-01-03,42
2020-01-04,52
2020-01-05,48
...,...
2023-02-23,40
2023-02-24,48
2023-02-25,53
2023-02-26,53


In [46]:
feat= FeatureEngineering(cubo)

In [47]:
feat.createVariables("Crimes_Z0T100")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp[column+"_"+str(i)] = temp[column].shift(i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp[column+"_"+str(i)] = temp[column].shift(i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp[column+"_"+str(i)] = temp[column].shift(i)
A value is trying to be set on a copy of a slice from a DataF

In [48]:
feat.temp

Unnamed: 0,Crimes_Z0T100,Crimes_Z0T100_1,Crimes_Z0T100_2,Crimes_Z0T100_3,Crimes_Z0T100_4,Crimes_Z0T100_5,Crimes_Z0T100_6,Crimes_Z0T100_7,Crimes_Z0T100_8,Crimes_Z0T100_9,...,exp0.4,exp0.5,exp0.6,exp0.7,exp0.8,exp0.9,MA10,MA20,MA30,MA40
2020-02-09,59,70.00000,74.00000,46.00000,58.00000,48.00000,72.00000,66.00000,61.00000,58.00000,...,65.08780,66.49352,67.84461,68.97577,69.76723,70.13123,61.20000,64.60000,63.30000,61.95000
2020-02-10,67,59.00000,70.00000,74.00000,46.00000,58.00000,48.00000,72.00000,66.00000,61.00000,...,62.65268,62.74676,62.53784,61.99273,61.15345,60.11312,62.10000,65.05000,63.86667,61.62500
2020-02-11,69,67.00000,59.00000,70.00000,74.00000,46.00000,58.00000,48.00000,72.00000,66.00000,...,64.39161,64.87338,65.21514,65.49782,65.83069,66.31131,62.90000,65.35000,64.30000,61.97500
2020-02-12,69,69.00000,67.00000,59.00000,70.00000,74.00000,46.00000,58.00000,48.00000,72.00000,...,66.23497,66.93669,67.48605,67.94935,68.36614,68.73113,63.20000,65.10000,64.56667,62.65000
2020-02-13,76,69.00000,69.00000,67.00000,59.00000,70.00000,74.00000,46.00000,58.00000,48.00000,...,67.34098,67.96834,68.39442,68.68480,68.87323,68.97311,63.60000,65.15000,65.20000,63.25000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-02-23,40,59.00000,70.00000,64.00000,71.00000,95.00000,65.00000,64.00000,80.00000,59.00000,...,65.92047,64.49253,63.20725,62.07217,61.04489,60.04910,66.70000,68.65000,67.33333,65.85000
2023-02-24,48,40.00000,59.00000,70.00000,64.00000,71.00000,95.00000,65.00000,64.00000,80.00000,...,55.55228,52.24627,49.28290,46.62165,44.20898,42.00491,65.60000,67.85000,66.96667,65.62500
2023-02-25,53,48.00000,40.00000,59.00000,70.00000,64.00000,71.00000,95.00000,65.00000,64.00000,...,52.53137,50.12313,48.51316,47.58650,47.24180,47.40049,62.90000,67.45000,66.63333,65.47500
2023-02-26,53,53.00000,48.00000,40.00000,59.00000,70.00000,64.00000,71.00000,95.00000,65.00000,...,52.71882,51.56157,51.20526,51.37595,51.84836,52.44005,61.80000,67.20000,66.30000,65.52500


In [49]:
 X_train, X_test, y_train, y_test,sc,sc2= feat.splitData("Crimes_Z0T100")