In [2]:
import pandas as pd
import numpy as np
import csv
import pickle


class Cleaner():

    def __init__(self,data):
        self.df=data.copy()
        self.df.loc[self.df.vict_age==0,'vict_age']=np.nan
        self.df.loc[(self.df.vict_age<0)|(self.df.vict_age>100),'vict_age']=np.nan
        self.dc_cat= dc_cat={'status':['AA', 'AO', 'IC', 'JA'],
                            'vict_sex': ['F', 'H', 'M'],
                            'vict_descent': ['A', 'B', 'C', 'D', 'F', 'G', 'H',
                                              'I', 'J', 'K', 'L', 'O', 'P', 'S','U', 'V', 'W', 'X'],
                            'ucr': ['AGG. Assults', 'BRGLARY', 'BTFV', 'Homicide', 'MVT', 'OTHER THEFT',
                                    'PERSONAL THFT', 'Rape', 'Robbery', 'SIMPLEASSAULT']}
        
    def dummies(self,df, ls,column):
        # Esta función la hice porque luego el OHE depende del orden de los factores, para eso hice el dc_cat
        # Así mantenemos el mismo formato siempre
        n=len(df)
        dc_values={key:[0 for x in range(n)] for key in ls}
        i=0
        for _,row in df.iterrows():
            try:
                a=ls.index(row[column])
                dc_values[row[column]][i]=1
            except:
                pass
            i+=1
        return pd.DataFrame(dc_values, index=df.index)
        
    def manageNan(self):
        # Concatenate crimes codes
        crm_cd_concat=self.df[['crm_cd_1','crm_cd_2','crm_cd_3','crm_cd_4']].fillna('').astype('string')
        crm_cd_concat=crm_cd_concat['crm_cd_1']+crm_cd_concat['crm_cd_2']+crm_cd_concat['crm_cd_3']+crm_cd_concat['crm_cd_4']
        self.df['crm_cd_concat']=crm_cd_concat
        # fill nan's and typos
        self.df['weapon_desc'].fillna('NO WEAPON',inplace=True)
        self.df['weapon_used_cd'].fillna('0',inplace=True)
        self.df.loc[self.df.vict_sex.isna(),'vict_sex']='X'
        self.df.loc[self.df.vict_sex=='H','vict_sex']=='X'
        self.df.loc[self.df.vict_descent.isna(),'vict_descent']='U'
        self.df.loc[self.df.lon==0,['lat','lon']]=np.nan
        # Drop columns
        self.df.drop(['crm_cd_1','crm_cd_2','crm_cd_3','crm_cd_4','area_name','crm_cd_desc','premis_desc','weapon_desc', 'status_desc'],axis=1,inplace=True)
        try:
            self.df.drop(self.df.loc[self.df.status=='CC'].index,inplace=True)
        except:
            pass
        
    def manageLocation(self):
        # Delete spaces
        self.df.location = self.df.location.str.strip()
        self.df.location = self.df.location.apply(lambda x: "  ".join(x.split()))
        self.df.location = self.df.location.str.replace(" ", "")
        self.df.location = self.df.location.str.upper()
        # Replace with dictonary values
        directorio = pd.read_excel('Directorio_Lat_Lon.xlsx')
        directorio.location = directorio.location.str.replace(" ", "")
        directorio.location = directorio.location.str.upper()
        loc_lat = dict(zip(directorio.location,directorio.lat))
        loc_lon = dict(zip(directorio.location,directorio.lon))
    
        # Verificar si las columnas "lat" o "lon" tienen valores faltantes
        mask = self.df['lat'].isna() | self.df['lon'].isna()
        # Verificar si los valores de la columna "location" están en las claves del diccionario "loc_lat"
        not_found = ~self.df.loc[mask, 'location'].isin(loc_lat.keys())
        self.df.loc[mask & ~not_found, 'lat'] = self.df.loc[mask & ~not_found, 'location'].map(loc_lat)
        self.df.loc[mask & ~not_found, 'lon'] = self.df.loc[mask & ~not_found, 'location'].map(loc_lon)
        ## Aqui duda, borramos los que no estan en el diccionario y son nulos o que sugieren?
        self.df.drop('location',axis=1,inplace=True)
        
    def clean(self):
        """Run the basic cleaning steps in order.

        Drops ``cross_street`` and then delegates to ``manageNan`` and
        ``manageLocation``. Victim ages blanked in ``__init__`` and
        coordinates that ``manageLocation`` cannot resolve are still missing
        afterwards.
        """
        self.df.drop(columns='cross_street', inplace=True)
        for step in (self.manageNan, self.manageLocation):
            step()
    
    def createDummies(self):
        ## Genera las dummies de las categoricas
        imputar=self.df.drop(['dr_no', 'date_rptd', 'date_occ','area',
                         'rpt_dist_no','part_1_2','crm_cd','mocodes','premis_cd',
                         'weapon_used_cd','crm_cd_concat','rpt_dist_no'],axis=1).copy()
        dum1= self.dummies(imputar,self.dc_cat["status"],"status")
        imputar.drop('status',axis=1,inplace=True)
        dum2= self.dummies(imputar,self.dc_cat["vict_sex"],"vict_sex")
        imputar.drop('vict_sex',axis=1,inplace=True)
        dum3= self.dummies(imputar,self.dc_cat["vict_descent"],"vict_descent")
        imputar.drop('vict_descent',axis=1,inplace=True)
        dum4= self.dummies(imputar,self.dc_cat["ucr"],"ucr")
        imputar.drop('ucr',axis=1,inplace=True)
        
        imputar=pd.concat([imputar,dum1,dum2,dum3,dum4],axis=1)

        return imputar
    
    def imputeAge(self):
        ## creo que este segmento que es del codigo segun yo, podría ser otra función aparte no?
        ucr=pd.read_csv('ucr.csv')
        ucr=ucr.astype({'Code':'string'})
        self.df=self.df.astype({'crm_cd':'string'})
        ucr=dict(zip(ucr['Code'],ucr['Subcategory']))
        self.df['ucr']=self.df.crm_cd.map(ucr)
        self.df.ucr=self.df.ucr.fillna('Other')

        ## Generamos las dummies y tomamos los registros sin valores
        imputar=self.createDummies()
        X=imputar[imputar.vict_age.isna()].copy()
        y=X.vict_age.values
        #print(X.shape)
        X=X.drop('vict_age',axis=1).values
        #print(X.shape)
        ## Aplicamos el modelo del vecino cercano
        file = open("models/escaladorXEdad.pkl",'rb')
        sc_X = pickle.load(file)
        file.close()
        
        X=sc_X.transform(X)
        clf=pickle.load(open('models/edad_regresion.sav', 'rb'))
        y=clf.predict(X)
        self.df.loc[self.df.vict_age.isna(),'vict_age']=y
        
    def imputePremis(self):
        ## Aquí veo que usamos de nuevo el crear dummies, hay que ver como solucionar esto jaja
        imputar=self.createDummies()
        X=imputar.loc[self.df.premis.isna(),imputar.columns!='premis']
        
        sc_X=pickle.load(open("models/escaladorXPremis.pkl", "rb"))
        X= sc_X.transform(X)
        
        clf=pickle.load(open('models/premis_clasificacion.sav', 'rb'))
        self.df.loc[self.df.premis.isna(),'premis']=clf.predict(X)
    
    def col_corr(self,codigo,descripcion):
        ## Honestamente no recuero que hacía esta mamada, ayudaaaa 
        dicc=dict(zip(self.df[codigo],self.df[descripcion]))
        w = csv.writer(open(codigo+".csv", "w"))
        for key, val in dicc.items():
            w.writerow([key, val])
        self.df.drop(descripcion,axis=1,inplace=True)
        
    def categoricas(self):
        ## Aqui hay que generar el listado de cuales van a cada categoría
        features=['rpt_dist_no','weapon_used_cd','premis']
        for feature in features:
            aux = self.df[feature].value_counts(True,dropna=False)
            ls_categories = [category for category, freq in aux.items() if freq > 0.03 or category is np.nan]
            self.df.loc[:,feature] = self.df[feature].map(lambda x: x if (x in ls_categories or x is np.nan) else "Others")
            
    def clipping(self):
        self.df['lon']=self.df['lon'].clip(-118.7, -118.1)
        self.df['lat']=self.df['lat'].clip(33.6, 34.4)
        
    def lugar(self):
        pl=[[[119,120,121,145,146.0,150.0,501.0, 502.0, 504.0,507.0, 508.0, 509.0, 510.0,511,515.0, 516.0,707],['vivienda']],
            [[101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 109.0, 110.0,116,117,
              124.0, 125.0,128,136.0, 137.0,152,154,158,415,705.0,748.0],['calle']],
            [[725.0, 726.0,240.0,732.0,752,753,756.0],['gobierno']],
            [[123,142,156,204.0, 213.0],['estacionamiento']],
            [[151.0,157.0,238,241,254,302,303,304.0, 305.0,702],['establecimiento']],
            [[201.0, 202.0, 203.0,205.0, 206.0,208,209.0, 210.0, 211.0,217.0, 218.0,
                                219.0, 220.0, 221.0, 222.0, 223.0, 224.0, 225.0,228,233.0,242,244.0, 245.0,
                                247.0, 248.0, 249.0,250.0, 251.0, 252.0,255,401.0, 402.0, 404.0, 405.0,
                                406.0, 407.0, 408.0,409.0, 410.0, 411.0, 412.0, 413.0, 414.0,417.0,703,
                                148.0,709,301,216
                                ],['establecimiento_publico']],
            [[207,706.0,733.0,735.0],['nocturno']],
            [[503,505,519.0],['hotel']],
            [[111.0, 112.0, 113.0, 114.0, 115.0,122,126,212.0, 214.0, 215.0,801.0, 802.0,804,809.0, 810.0,
                 811.0, 834.0, 835.0, 836.0, 868.0, 869.0, 870.0, 871.0, 872.0, 873.0,
                 874.0, 875.0, 876.0, 877.0, 878.0, 879.0, 880.0, 882.0, 883.0, 884.0,
                                885.0, 889.0, 890.0, 891.0, 892.0, 893.0, 894.0, 895.0, 896.0, 897.0,
                                898.0, 900.0, 901.0, 902.0, 903.0, 904.0, 905.0, 906.0, 907.0, 908.0,
                                909.0, 910.0, 911.0, 912.0, 913.0, 915.0, 916.0, 917.0, 918.0, 919.0,
                                920.0, 921.0, 922.0, 931.0, 932.0, 933.0, 934.0, 935.0, 936.0, 937.0,
                                940.0, 941.0, 942.0, 943.0, 944.0, 945.0, 946.0, 947.0, 948.0, 949.0,
                                950.0, 951.0, 952.0, 953.0, 954.0, 956.0, 957.0, 958.0, 961.0, 962.0,
                                963.0, 964.0, 965.0, 966.0, 967.0, 968.0, 969.0, 970.0, 971.0,129,135
                               ],['transporte']],
            [[138,143,727,140],['escaleras']],
            [[139,108,147.0,712.0,713,144,714.0, 715.0, 716.0, 717.0, 718.0,711,734.0, 736.0, 737.0, 738.0, 739.0,
                  742.0, 743.0,754,757.0, 758.0],['recreacion']],
            [[141,149,155,745.0,107.0,118,506.0,518.0,127],['espacio_abierto']],
            [[227.0,234.0, 235.0, 236.0, 237.0,239,246.0,253,403,701.0,719,755],['medico']],
            [[229,601,602.0, 603.0, 604.0, 605.0, 606.0, 607.0, 608.0],['financiero']],
            [[230.0, 512.0,514,517],['cuidado_personas']],
            [[231.0,704.0,720.0, 721.0, 722.0, 723.0,724.0,729],['escuela']],
            [[232.0,245.0,513,710,744.0,746.0],['otro']],
            [[709.0, 730.0, 731.0,740],['iglesia']],
            [[750.0, 751.0],['internet']]]
        premis=[]
        for secc in pl:
            premis+=list(zip(secc[0],secc[1]*len(secc[0])))
        premis=dict(premis)
        self.df['premis']=self.df.premis_cd.map(premis)
    def zonas(self):
        clusters=pickle.load(open('cluster5.sav','rb'))
        self.df['zonas']=clusters.predict(self.df[['lat','lon']])
    def categorias(self):
        columnas=['weapon_used_cd','premis']
        for col in columnas:
            with open("Data/Cleaner/"+col+"categorias", "rb") as fp:  
                lista = pickle.load(fp)
            self.df[col]=self.df[col].apply(lambda x: x if x in lista else 'Others')
    def simple_imp(self):
        with open("Data/Cleaner/imputer_mode", "rb") as fp:
            imputador== pickle.load(fp)
        self.df[['premis_cd','crm_cd','status']]=imputador.transform(self.df[['premis_cd','crm_cd','status']])
    def super_clases(self):
        temp=pd.read_csv('Data/Cleaner/crcd_Spcls.csv')
        temp.drop('NewCode',axis=1,inplace=True)
        self.df=self.df.merge(temp, how='left', left_on='crm_cd', right_on='crm_cd')
        self.df.drop('crm_cd',axis=1,inplace=True)
        self.df.rename(columns = {'Superclass':'crime'}, inplace = True)
## The test run starts here: these should be the steps needed to leave the data clean.



In [7]:
# Load the raw extract and keep its first 2000 rows as a working sample.
df = pd.read_csv("lapd.csv")
sample = df.head(2000).copy()
df.shape

(682335, 28)

In [8]:
# Build the cleaner on the 2000-row sample and run the pipeline step by step.
test= Cleaner(sample)

# Drops cross_street, then runs manageNan and manageLocation. The recorded
# run stopped with FileNotFoundError for 'Directorio_Lat_Lon.xlsx', the
# workbook manageLocation reads from the working directory.
test.clean()

# Adds the `ucr` column and fills missing victim ages with the pickled model.
test.imputeAge()

# Clamps lat/lon to the fixed bounding box.
test.clipping()

# Derives the `premis` place category from `premis_cd`.
test.lugar()

# Collapses rare values of the categorical features into "Others".
test.categoricas()

# Fills the remaining missing `premis` values with the pickled classifier.
test.imputePremis()
# NOTE(review): this displays the shape of the raw frame `df`, not of the
# cleaned `test.df` -- confirm which one was meant.
df.shape

FileNotFoundError: [Errno 2] No such file or directory: 'Directorio_Lat_Lon.xlsx'

In [28]:
temp

Unnamed: 0,Superclass,crm_cd
0,Robo Vehículo,330
1,Robo Vehículo,331
2,Robo Vehículo,420
3,Robo Vehículo,510
4,Robo Vehículo,522
...,...,...
137,Agresión,932
138,Agresión,933
139,Agresión,940
140,Agresión,943


In [35]:
df

Unnamed: 0,dr_no,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,part_1_2,crm_cd_desc,mocodes,...,status_desc,crm_cd_1,location,lat,lon,crm_cd_2,cross_street,crm_cd_3,crm_cd_4,crime
0,10304468,2020-01-08T00:00:00.000,2020-01-08T00:00:00.000,2230,3,Southwest,377,2,BATTERY - SIMPLE ASSAULT,0444 0913,...,Adult Other,624.0,1100 W 39TH PL,34.0141,-118.2978,,,,,Robo Vehículo
1,190101086,2020-01-02T00:00:00.000,2020-01-01T00:00:00.000,330,1,Central,163,2,BATTERY - SIMPLE ASSAULT,0416 1822 1414,...,Invest Cont,624.0,700 S HILL ST,34.0459,-118.2545,,,,,Robo Vehículo
2,200110444,2020-04-14T00:00:00.000,2020-02-13T00:00:00.000,1200,1,Central,155,2,SEX OFFENDER REGISTRANT OUT OF COMPLIANCE,1501,...,Adult Arrest,845.0,200 E 6TH ST,34.0448,-118.2474,,,,,Agresión
3,191501505,2020-01-01T00:00:00.000,2020-01-01T00:00:00.000,1730,15,N Hollywood,1543,2,VANDALISM - MISDEAMEANOR ($399 OR UNDER),0329 1402,...,Invest Cont,745.0,5400 CORTEEN PL,34.1685,-118.4019,998.0,,,,Disturbios a la sociedad
4,191921269,2020-01-01T00:00:00.000,2020-01-01T00:00:00.000,415,19,Mission,1998,2,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",0329,...,Invest Cont,740.0,14400 TITUS ST,34.2198,-118.4468,,,,,Disturbios a la sociedad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
682330,230806871,2023-03-03T00:00:00.000,2023-03-02T00:00:00.000,620,8,West LA,889,2,THEFT OF IDENTITY,1822 0100,...,Invest Cont,354.0,2000 S HOLT AV,34.0412,-118.3814,,,,,Falta a la ley
682331,231104474,2023-01-12T00:00:00.000,2023-01-12T00:00:00.000,1240,11,Northeast,1107,1,THEFT PLAIN - PETTY ($950 & UNDER),1822 0344,...,Invest Cont,440.0,1000 MILWAUKEE AV,34.1214,-118.1915,,,,,Robo
682332,230804266,2023-01-08T00:00:00.000,2023-01-08T00:00:00.000,1030,8,West LA,839,1,"THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LI...",0344 1822,...,Invest Cont,341.0,10200 SANTA MONICA BL,34.0611,-118.4184,,,,,Robo
682333,231604807,2023-01-27T00:00:00.000,2023-01-26T00:00:00.000,1800,16,Foothill,1663,2,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",1300 0329,...,Invest Cont,740.0,12500 BRANFORD ST,34.2466,-118.4054,,,,,Disturbios a la sociedad
