In [2]:
import geopandas as gpd
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from datetime import datetime
from pandas import Timestamp
from sklearn.preprocessing import OneHotEncoder

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

Data Preprocessing

In [3]:
change_type_map = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4,
       'Mega Projects': 5}

## Read csvs

train = gpd.read_file('train.geojson', index_col=0)
test = gpd.read_file('test.geojson', index_col=0)


In [4]:
train_df = train.copy()
test_df=test.copy()

In [5]:
# Checking datatypes
print(train_df.columns)
print([type(train_df[column][0]) for column in train_df.columns])

Index(['urban_type', 'geography_type', 'change_type', 'img_red_mean_date1',
       'img_green_mean_date1', 'img_blue_mean_date1', 'img_red_std_date1',
       'img_green_std_date1', 'img_blue_std_date1', 'img_red_mean_date2',
       'img_green_mean_date2', 'img_blue_mean_date2', 'img_red_std_date2',
       'img_green_std_date2', 'img_blue_std_date2', 'img_red_mean_date3',
       'img_green_mean_date3', 'img_blue_mean_date3', 'img_red_std_date3',
       'img_green_std_date3', 'img_blue_std_date3', 'img_red_mean_date4',
       'img_green_mean_date4', 'img_blue_mean_date4', 'img_red_std_date4',
       'img_green_std_date4', 'img_blue_std_date4', 'img_red_mean_date5',
       'img_green_mean_date5', 'img_blue_mean_date5', 'img_red_std_date5',
       'img_green_std_date5', 'img_blue_std_date5', 'date0',
       'change_status_date0', 'date1', 'change_status_date1', 'date2',
       'change_status_date2', 'date3', 'change_status_date3', 'date4',
       'change_status_date4', 'index', 'geometry']

- It seems like urban_type, geography_type, change_type are string types. We have to find a way to translate it into number to manipulate a numbaer matrix.
- All the information about the tones on the image are in numbers, so we'll keep it that way.
- "index" does not give meaningful information about the data
- change_status_day is a string as well that we can evaluate as a number
- dateN is a string for a date
- geometry is a list of vectors that cannot be treated like so, so we are going to have to modify it.

In [6]:
#rectification for change_type

train_df=train_df.replace(to_replace="Demolition",value=0)
train_df=train_df.replace(to_replace="Road",value=1)
train_df=train_df.replace(to_replace="Residential",value=2)
train_df=train_df.replace(to_replace="Commercial",value=3)
train_df=train_df.replace(to_replace="Industrial",value=4)
train_df=train_df.replace(to_replace="Mega Projects",value=5)

In [7]:
#Calcul de l'aire du polygone

train_df=train_df.assign(area=lambda x: x.geometry.area)

test_df=test_df.assign(area=lambda x: x.geometry.area)


  train_df=train_df.assign(area=lambda x: x.geometry.area)

  test_df=test_df.assign(area=lambda x: x.geometry.area)


In [8]:
train_df=train_df.drop(columns=['geometry'],axis=1)
test_df=test_df.drop(columns=['geometry'],axis=1)

In [9]:
#one hot encoding for geography_type

geo_typet=pd.DataFrame(train_df[['geography_type','index']])
geo_typet['geography_type'] = geo_typet['geography_type'].str.split(',')
geo_typet = geo_typet.explode('geography_type')
y = geo_typet['geography_type'].str.get_dummies(sep=',')
geo_typet = pd.concat([geo_typet,y], axis = 1)
geo_typet = geo_typet.groupby(['index']).sum().reset_index()
geo_typet=geo_typet.drop(columns=['A','N'],axis=1)
train_df=pd.merge(train_df,geo_typet,left_on='index',right_on='index')
train_df=train_df.drop(columns=['geography_type'],axis=1)

In [10]:
#one hot encoding for geography_type on test

geo_typet_t=pd.DataFrame(test_df[['geography_type','index']])
geo_typet_t['geography_type'] = geo_typet_t['geography_type'].str.split(',')
geo_typet_t = geo_typet_t.explode('geography_type')
y = geo_typet_t['geography_type'].str.get_dummies(sep=',')
geo_typet_t = pd.concat([geo_typet_t,y], axis = 1)
geo_typet_t = geo_typet_t.groupby(['index']).sum().reset_index()
geo_typet_t=geo_typet_t.drop(columns=['A','N'],axis=1)
test_df=pd.merge(test_df,geo_typet_t,left_on='index',right_on='index')
test_df=test_df.drop(columns=['geography_type'],axis=1)

In [11]:
#one hot encoding for urban_type

urb_typet=pd.DataFrame(train_df[['urban_type','index']])
urb_typet['urban_type'] = urb_typet['urban_type'].str.split(',')
urb_typet = urb_typet.explode('urban_type')
y = urb_typet['urban_type'].str.get_dummies(sep=',')
urb_typet = pd.concat([urb_typet,y], axis = 1)
urb_typet = urb_typet.groupby(['index']).sum().reset_index()
urb_typet=urb_typet.drop(columns=['A','N'],axis=1)
train_df=pd.merge(train_df,urb_typet,left_on='index',right_on='index')
train_df=train_df.drop(columns=['urban_type'],axis=1)

In [12]:
#one hot encoding for urban_type on test

urb_typet_t=pd.DataFrame(test_df[['urban_type','index']])
urb_typet_t['urban_type'] = urb_typet_t['urban_type'].str.split(',')
urb_typet_t = urb_typet_t.explode('urban_type')
y = urb_typet_t['urban_type'].str.get_dummies(sep=',')
urb_typet_t = pd.concat([urb_typet_t,y], axis = 1)
urb_typet_t = urb_typet_t.groupby(['index']).sum().reset_index()
urb_typet_t=urb_typet_t.drop(columns=['A','N'],axis=1)
test_df=pd.merge(test_df,urb_typet_t,left_on='index',right_on='index')
test_df=test_df.drop(columns=['urban_type'],axis=1)

In [13]:
print(train_df.shape)
print(test_df.shape)

(296146, 59)
(120526, 58)


In [14]:
savings_1_train=train_df.copy()
savings_1_test=test_df.copy()

Changer les dates en variables utilisables par python

In [15]:
#we change the date form in order to be able to use it

train_df=train_df.assign(date0_1=lambda x: x.date0.str.split('-'))
train_df=train_df.assign(date1_1=lambda x: x.date1.str.split('-'))
train_df=train_df.assign(date2_1=lambda x: x.date2.str.split('-'))
train_df=train_df.assign(date3_1=lambda x: x.date3.str.split('-'))
train_df=train_df.assign(date4_1=lambda x: x.date4.str.split('-'))

test_df=test_df.assign(date0_1=lambda x: x.date0.str.split('-'))
test_df=test_df.assign(date1_1=lambda x: x.date1.str.split('-'))
test_df=test_df.assign(date2_1=lambda x: x.date2.str.split('-'))
test_df=test_df.assign(date3_1=lambda x: x.date3.str.split('-'))
test_df=test_df.assign(date4_1=lambda x: x.date4.str.split('-'))


In [16]:
from datetime import datetime

def convert_list_date_to_conventional_date(L):
    if L==None:
        return None
    date_format = '%d/%m/%Y'
    date = datetime.strptime('/'.join(L), date_format)
    return date



In [17]:
train_df['date0_1']=train_df['date0_1'].apply(lambda x : convert_list_date_to_conventional_date(x))
train_df['date1_1']=train_df['date1_1'].apply(lambda x : convert_list_date_to_conventional_date(x))
train_df['date2_1']=train_df['date2_1'].apply(lambda x : convert_list_date_to_conventional_date(x))
train_df['date3_1']=train_df['date3_1'].apply(lambda x : convert_list_date_to_conventional_date(x))
train_df['date4_1']=train_df['date4_1'].apply(lambda x : convert_list_date_to_conventional_date(x))

test_df['date0_1']=test_df['date0_1'].apply(lambda x : convert_list_date_to_conventional_date(x))
test_df['date1_1']=test_df['date1_1'].apply(lambda x : convert_list_date_to_conventional_date(x))
test_df['date2_1']=test_df['date2_1'].apply(lambda x : convert_list_date_to_conventional_date(x))
test_df['date3_1']=test_df['date3_1'].apply(lambda x : convert_list_date_to_conventional_date(x))
test_df['date4_1']=test_df['date4_1'].apply(lambda x : convert_list_date_to_conventional_date(x))

#the time is now readable by python

In [18]:
train_df = train_df.drop(columns=['date0','date1','date2', 'date3', 'date4'], axis = 1)
test_df = test_df.drop(columns=['date0','date1','date2', 'date3', 'date4'], axis = 1)

In [19]:
train_df

Unnamed: 0,change_type,img_red_mean_date1,img_green_mean_date1,img_blue_mean_date1,img_red_std_date1,img_green_std_date1,img_blue_std_date1,img_red_mean_date2,img_green_mean_date2,img_blue_mean_date2,...,Dense Urban,Industrial,Rural,Sparse Urban,Urban Slum,date0_1,date1_1,date2_1,date3_1,date4_1
0,1,93.371775,107.291113,89.827379,29.812040,28.328368,25.324294,125.773062,139.833243,134.900701,...,0,0,0,1,0,2018-08-01,2013-12-09,2016-09-10,2019-07-22,2017-07-24
1,1,96.071674,107.061702,90.755556,24.896240,22.275180,22.080686,133.097679,145.385190,137.092518,...,0,0,0,1,0,2018-08-01,2013-12-09,2016-09-10,2019-07-22,2017-07-24
2,1,101.212148,113.462178,95.670574,24.179684,21.873401,21.285197,120.713490,131.633447,124.436492,...,0,0,0,1,0,2018-08-01,2013-12-09,2016-09-10,2019-07-22,2017-07-24
3,1,94.463311,99.995531,84.470046,26.869852,23.767679,19.351983,114.819776,127.827828,120.435373,...,0,0,1,0,0,2018-08-01,2013-12-09,2016-09-10,2019-07-22,2017-07-24
4,0,151.883646,191.710197,211.569244,52.465332,59.441844,52.304349,141.514462,171.079581,181.960612,...,1,0,0,0,0,2018-08-01,2013-12-09,2016-09-10,2019-07-22,2017-07-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296141,3,239.297084,229.193482,215.205832,25.969706,31.586712,32.155574,140.346141,116.700172,98.334477,...,0,0,0,0,0,2014-11-19,2017-02-25,2014-01-27,2018-03-28,2015-12-28
296142,2,162.912319,143.865217,122.935145,56.127846,44.184674,49.760802,103.760507,81.104710,70.001087,...,0,0,0,1,0,2014-11-19,2017-02-25,2014-01-27,2018-03-28,2015-12-28
296143,2,111.304320,94.723404,80.374597,21.540545,17.786801,18.143091,68.845906,62.948420,51.315925,...,0,0,0,1,0,2014-11-19,2017-02-25,2014-01-27,2018-03-28,2015-12-28
296144,2,137.374613,136.108359,113.544892,32.344779,30.077877,29.759516,98.718266,85.318885,72.572755,...,0,0,0,1,0,2014-11-19,2017-02-25,2014-01-27,2018-03-28,2015-12-28


In [20]:
#sort the date and images according to the date
def is_valid_timestamp(timestamp_element):
    return timestamp_element is not None and isinstance(timestamp_element, pd.Timestamp)

#for some lines their dates are not accessible, this is why we will use the function above 
for i in range(296146):
    #the index is different from the number of lines as some have been deleted, this is why we directly put the greatest index
    try:
        result = train_df.loc[i]
    except KeyError:
        result = None
        pass
    #some lines are not accessible and python gives an error
    if result is not None:
        if is_valid_timestamp(train_df['date0_1'][i]) and is_valid_timestamp(train_df['date1_1'][i]) and is_valid_timestamp(train_df['date2_1'][i]) and is_valid_timestamp(train_df['date3_1'][i]) and is_valid_timestamp(train_df['date4_1'][i]) :
            date=[train_df['date0_1'][i],train_df['date1_1'][i],train_df['date2_1'][i],train_df['date3_1'][i], train_df['date4_1'][i]]
            green=[train_df['img_green_mean_date1'][i],train_df['img_green_mean_date2'][i],train_df['img_green_mean_date3'][i],train_df['img_green_mean_date4'][i],train_df['img_green_mean_date5'][i]]
            red=[train_df['img_red_mean_date1'][i],train_df['img_red_mean_date2'][i], train_df['img_red_mean_date3'][i],train_df['img_red_mean_date4'][i],train_df['img_red_mean_date5'][i]]
            blue=[train_df['img_blue_mean_date1'][i],train_df['img_blue_mean_date2'][i],train_df['img_blue_mean_date3'][i],train_df['img_blue_mean_date4'][i],train_df['img_blue_mean_date5'][i]]
            green_std=[train_df['img_green_std_date1'][i],train_df['img_green_std_date2'][i],train_df['img_green_std_date3'][i],train_df['img_green_std_date4'][i],train_df['img_green_std_date5'][i]]
            red_std=[train_df['img_red_std_date1'][i],train_df['img_red_std_date2'][i],train_df['img_red_std_date3'][i],train_df['img_red_std_date4'][i],train_df['img_red_std_date5'][i]]
            blue_std=[train_df['img_blue_std_date1'][i],train_df['img_blue_std_date2'][i],train_df['img_blue_std_date3'][i],train_df['img_blue_std_date4'][i],train_df['img_blue_std_date5'][i]]
            status=[train_df['change_status_date0'][i],train_df['change_status_date1'][i],train_df['change_status_date2'][i],train_df['change_status_date3'][i],train_df['change_status_date4'][i]]

            sorted_timestamps = sorted(enumerate(date), key=lambda x: x[1])
            S_L=[index for index, timestamp in sorted_timestamps] #it gives the order of the argument according to the time, like argsort()

            train_df.loc[i,'date0_1'],train_df.loc[i,'date1_1'],train_df.loc[i,'date2_1'],train_df.loc[i,'date3_1'],train_df.loc[i,'date4_1']=date[S_L[0]],date[S_L[1]],date[S_L[2]], date[S_L[3]], date[S_L[4]]
            train_df.loc[i,'img_green_mean_date1'],train_df.loc[i,'img_green_mean_date2'],train_df.loc[i,'img_green_mean_date3'],train_df.loc[i,'img_green_mean_date4'],train_df.loc[i,'img_green_mean_date5'] =green[S_L[0]],green[S_L[1]],green[S_L[2]],green[S_L[3]],green[S_L[4]]
            train_df.loc[i,'img_red_mean_date1'],train_df.loc[i,'img_red_mean_date2'],train_df.loc[i,'img_red_mean_date3'],train_df.loc[i,'img_red_mean_date4'],train_df.loc[i,'img_red_mean_date5']=red[S_L[0]], red[S_L[1]], red[S_L[2]],red[S_L[3]],red[S_L[4]]
            train_df.loc[i,'img_blue_mean_date1'],train_df.loc[i,'img_blue_mean_date2'],train_df.loc[i,'img_blue_mean_date3'],train_df.loc[i,'img_blue_mean_date4'],train_df.loc[i,'img_blue_mean_date5']=blue[S_L[0]],blue[S_L[1]],blue[S_L[2]],blue[S_L[3]],blue[S_L[4]]
            train_df.loc[i,'change_status_date0'],train_df.loc[i,'change_status_date1'],train_df.loc[i,'change_status_date2'],train_df.loc[i,'change_status_date3'],train_df.loc[i,'change_status_date4']=status[S_L[0]], status[S_L[1]],status[S_L[2]], status[S_L[3]], status[S_L[4]]
            train_df.loc[i,'img_green_std_date1'],train_df.loc[i,'img_green_std_date2'],train_df.loc[i,'img_green_std_date3'],train_df.loc[i,'img_green_std_date4'],train_df.loc[i,'img_green_std_date5']=green_std[S_L[0]],green_std[S_L[1]],green_std[S_L[2]],green_std[S_L[3]],green_std[S_L[4]]
            train_df.loc[i,'img_red_std_date1'],train_df.loc[i,'img_red_std_date2'],train_df.loc[i,'img_red_std_date3'],train_df.loc[i,'img_red_std_date4'],train_df.loc[i,'img_red_std_date5']=red_std[S_L[0]], red_std[S_L[1]], red_std[S_L[2]], red_std[S_L[3]], red_std[S_L[4]]
            train_df.loc[i,'img_blue_std_date1'],train_df.loc[i,'img_blue_std_date2'],train_df.loc[i,'img_blue_std_date3'],train_df.loc[i,'img_blue_std_date4'],train_df.loc[i,'img_blue_std_date5']=blue_std[S_L[0]],blue_std[S_L[1]],blue_std[S_L[2]],blue_std[S_L[3]],blue_std[S_L[4]]

for i in range(120526):
    #the index is different from the number of lines as some have been deleted, this is why we directly put the greatest index
    try:
        result = test_df.loc[i]
    except KeyError:
        result = None
        pass
    #some lines are not accessible and python gives an error
    if result is not None:
        if is_valid_timestamp(test_df['date0_1'][i]) and is_valid_timestamp(test_df['date1_1'][i]) and is_valid_timestamp(test_df['date2_1'][i]) and is_valid_timestamp(test_df['date3_1'][i]) and is_valid_timestamp(test_df['date4_1'][i]) :
            date=[test_df['date0_1'][i],test_df['date1_1'][i],test_df['date2_1'][i],test_df['date3_1'][i], test_df['date4_1'][i]]
            green=[test_df['img_green_mean_date1'][i],test_df['img_green_mean_date2'][i],test_df['img_green_mean_date3'][i],test_df['img_green_mean_date4'][i],test_df['img_green_mean_date5'][i]]
            red=[test_df['img_red_mean_date1'][i],test_df['img_red_mean_date2'][i], test_df['img_red_mean_date3'][i],test_df['img_red_mean_date4'][i],test_df['img_red_mean_date5'][i]]
            blue=[test_df['img_blue_mean_date1'][i],test_df['img_blue_mean_date2'][i],test_df['img_blue_mean_date3'][i],test_df['img_blue_mean_date4'][i],test_df['img_blue_mean_date5'][i]]
            green_std=[test_df['img_green_std_date1'][i],test_df['img_green_std_date2'][i],test_df['img_green_std_date3'][i],test_df['img_green_std_date4'][i],test_df['img_green_std_date5'][i]]
            red_std=[test_df['img_red_std_date1'][i],test_df['img_red_std_date2'][i],test_df['img_red_std_date3'][i],test_df['img_red_std_date4'][i],test_df['img_red_std_date5'][i]]
            blue_std=[test_df['img_blue_std_date1'][i],test_df['img_blue_std_date2'][i],test_df['img_blue_std_date3'][i],test_df['img_blue_std_date4'][i],test_df['img_blue_std_date5'][i]]
            status=[test_df['change_status_date0'][i],test_df['change_status_date1'][i],test_df['change_status_date2'][i],test_df['change_status_date3'][i],test_df['change_status_date4'][i]]

            sorted_timestamps = sorted(enumerate(date), key=lambda x: x[1])
            S_L=[index for index, timestamp in sorted_timestamps] #it gives the order of the argument according to the time, like argsort()

            test_df.loc[i,'date0_1'],test_df.loc[i,'date1_1'],test_df.loc[i,'date2_1'],test_df.loc[i,'date3_1'],test_df.loc[i,'date4_1']=date[S_L[0]],date[S_L[1]],date[S_L[2]], date[S_L[3]], date[S_L[4]]
            test_df.loc[i,'img_green_mean_date1'],test_df.loc[i,'img_green_mean_date2'],test_df.loc[i,'img_green_mean_date3'],test_df.loc[i,'img_green_mean_date4'],test_df.loc[i,'img_green_mean_date5'] =green[S_L[0]],green[S_L[1]],green[S_L[2]],green[S_L[3]],green[S_L[4]]
            test_df.loc[i,'img_red_mean_date1'],test_df.loc[i,'img_red_mean_date2'],test_df.loc[i,'img_red_mean_date3'],test_df.loc[i,'img_red_mean_date4'],test_df.loc[i,'img_red_mean_date5']=red[S_L[0]], red[S_L[1]], red[S_L[2]],red[S_L[3]],red[S_L[4]]
            test_df.loc[i,'img_blue_mean_date1'],test_df.loc[i,'img_blue_mean_date2'],test_df.loc[i,'img_blue_mean_date3'],test_df.loc[i,'img_blue_mean_date4'],test_df.loc[i,'img_blue_mean_date5']=blue[S_L[0]],blue[S_L[1]],blue[S_L[2]],blue[S_L[3]],blue[S_L[4]]
            test_df.loc[i,'change_status_date0'],test_df.loc[i,'change_status_date1'],test_df.loc[i,'change_status_date2'],test_df.loc[i,'change_status_date3'],test_df.loc[i,'change_status_date4']=status[S_L[0]], status[S_L[1]],status[S_L[2]], status[S_L[3]], status[S_L[4]]
            test_df.loc[i,'img_green_std_date1'],test_df.loc[i,'img_green_std_date2'],test_df.loc[i,'img_green_std_date3'],test_df.loc[i,'img_green_std_date4'],test_df.loc[i,'img_green_std_date5']=green_std[S_L[0]],green_std[S_L[1]],green_std[S_L[2]],green_std[S_L[3]],green_std[S_L[4]]
            test_df.loc[i,'img_red_std_date1'],test_df.loc[i,'img_red_std_date2'],test_df.loc[i,'img_red_std_date3'],test_df.loc[i,'img_red_std_date4'],test_df.loc[i,'img_red_std_date5']=red_std[S_L[0]], red_std[S_L[1]], red_std[S_L[2]], red_std[S_L[3]], red_std[S_L[4]]
            test_df.loc[i,'img_blue_std_date1'],test_df.loc[i,'img_blue_std_date2'],test_df.loc[i,'img_blue_std_date3'],test_df.loc[i,'img_blue_std_date4'],test_df.loc[i,'img_blue_std_date5']=blue_std[S_L[0]],blue_std[S_L[1]],blue_std[S_L[2]],blue_std[S_L[3]],blue_std[S_L[4]]


# We ordered the images and change status by date of registration
# We believe tuning those features will help the training as the evolution of the land we be more easily compared
# It is time-consumming though, but is less than 13min

In [None]:
savings_1_train=train_df.copy()
savings_1_test=test_df.copy()

In [None]:
train_df=train_df.assign(temps_construction=lambda x: 0)

test_df=test_df.assign(temps_construction=lambda x: 0)

In [None]:
def time_construction(d1,d2): 
    dif=d2-d1
    return dif.days

#we compute the time of construction between the change status "Construction Done" the least recent and the change status before (different than "Construction Midway" to have the start of the construction)
for i in range(296146):
        try:
            result = train_df.loc[i]
        except KeyError:
            result = None
            pass
        if result is not None:
            if is_valid_timestamp(train_df['date0_1'][i]) and is_valid_timestamp(train_df['date1_1'][i]) and is_valid_timestamp(train_df['date2_1'][i]) and is_valid_timestamp(train_df['date3_1'][i]) and is_valid_timestamp(train_df['date4_1'][i]) :
                L=[train_df['change_status_date0'][i],train_df['change_status_date1'][i],train_df['change_status_date2'][i],train_df['change_status_date3'][i],train_df['change_status_date4'][i]]
                if L[1]=='Construction Done':
                    train_df.loc[i,'temps_construction']=time_construction(train_df['date0_1'][i],train_df['date1_1'][i])
                elif L[2]=='Construction Done':
                    if L[1]=='Construction Midway':
                        train_df.loc[i,'temps_construction']=time_construction(train_df['date0_1'][i],train_df['date2_1'][i])
                    else :
                        train_df.loc[i,'temps_construction']=time_construction(train_df['date1_1'][i],train_df['date2_1'][i])
                elif L[3]=='Construction Done':
                    if L[2]=='Construction Midway':
                        if L[1]=='Construction Midway':
                            train_df.loc[i,'temps_construction']=time_construction(train_df['date0_1'][i],train_df['date3_1'][i])
                        else:
                            train_df.loc[i,'temps_construction']=time_construction(train_df['date1_1'][i],train_df['date3_1'][i])
                    else :
                        train_df.loc[i,'temps_construction']=time_construction(train_df['date2_1'][i],train_df['date3_1'][i])
                elif L[4]=='Construction Done':
                    if L[3]=='Construction Midway':
                        if L[2]=='Construction Midway':
                            if L[1]=='Construction Midway':
                                train_df.loc[i,'temps_construction']=time_construction(train_df['date0_1'][i],train_df['date4_1'][i])
                            else :
                                train_df.loc[i,'temps_construction']=time_construction(train_df['date1_1'][i],train_df['date4_1'][i])
                        else :
                            train_df.loc[i,'temps_construction']=time_construction(train_df['date2_1'][i],train_df['date4_1'][i])
                    else :
                        train_df.loc[i,'temps_construction']=time_construction(train_df['date3_1'][i],train_df['date4_1'][i])
                else : 
                    train_df.loc[i,'temps_construction']=-1000


for i in range(120526):
        try:
            result = test_df.loc[i]
        except KeyError:
            result = None
            pass
        if result is not None:
            if is_valid_timestamp(test_df['date0_1'][i]) and is_valid_timestamp(test_df['date1_1'][i]) and is_valid_timestamp(test_df['date2_1'][i]) and is_valid_timestamp(test_df['date3_1'][i]) and is_valid_timestamp(test_df['date4_1'][i]) :
                L=[test_df['change_status_date0'][i],test_df['change_status_date1'][i],test_df['change_status_date2'][i],test_df['change_status_date3'][i],test_df['change_status_date4'][i]]
                if L[1]=='Construction Done':
                    test_df.loc[i,'temps_construction']=time_construction(test_df['date0_1'][i],test_df['date1_1'][i])
                elif L[2]=='Construction Done':
                    if L[1]=='Construction Midway':
                        test_df.loc[i,'temps_construction']=time_construction(test_df['date0_1'][i],test_df['date2_1'][i])
                    else :
                        test_df.loc[i,'temps_construction']=time_construction(test_df['date1_1'][i],test_df['date2_1'][i])
                elif L[3]=='Construction Done':
                    if L[2]=='Construction Midway':
                        if L[1]=='Construction Midway':
                            test_df.loc[i,'temps_construction']=time_construction(test_df['date0_1'][i],test_df['date3_1'][i])
                        else:
                            test_df.loc[i,'temps_construction']=time_construction(test_df['date1_1'][i],test_df['date3_1'][i])
                    else :
                        test_df.loc[i,'temps_construction']=time_construction(test_df['date2_1'][i],test_df['date3_1'][i])
                elif L[4]=='Construction Done':
                    if L[3]=='Construction Midway':
                        if L[2]=='Construction Midway':
                            if L[1]=='Construction Midway':
                                test_df.loc[i,'temps_construction']=time_construction(test_df['date0_1'][i],test_df['date4_1'][i])
                            else :
                                test_df.loc[i,'temps_construction']=time_construction(test_df['date1_1'][i],test_df['date4_1'][i])
                        else :
                            test_df.loc[i,'temps_construction']=time_construction(test_df['date2_1'][i],test_df['date4_1'][i])
                    else :
                        test_df.loc[i,'temps_construction']=time_construction(test_df['date3_1'][i],test_df['date4_1'][i])
                else : 
                    test_df.loc[i,'temps_construction']=-1000 
                    #on met un temps negatif pour les destructions et la plupart des temps de construction sont à 1000 jours 
# The time of this operation is generally 3min

train_df

Unnamed: 0,change_type,img_red_mean_date1,img_green_mean_date1,img_blue_mean_date1,img_red_std_date1,img_green_std_date1,img_blue_std_date1,img_red_mean_date2,img_green_mean_date2,img_blue_mean_date2,...,Industrial,Rural,Sparse Urban,Urban Slum,date0_1,date1_1,date2_1,date3_1,date4_1,temps_construction
0,1,125.773062,139.833243,134.900701,28.269984,28.264907,25.008032,150.766726,158.964529,149.356684,...,0,0,1,0,2013-12-09,2016-09-10,2017-07-24,2018-08-01,2019-07-22,690
1,1,133.097679,145.385190,137.092518,23.683964,21.651242,20.271657,184.480155,186.896779,174.235331,...,0,0,1,0,2013-12-09,2016-09-10,2017-07-24,2018-08-01,2019-07-22,1045
2,1,120.713490,131.633447,124.436492,28.951475,25.329365,22.505835,148.150431,158.490141,148.571269,...,0,0,1,0,2013-12-09,2016-09-10,2017-07-24,2018-08-01,2019-07-22,373
3,1,114.819776,127.827828,120.435373,34.091970,27.963117,23.901639,148.322747,156.855136,148.180798,...,0,1,0,0,2013-12-09,2016-09-10,2017-07-24,2018-08-01,2019-07-22,1045
4,0,141.514462,171.079581,181.960612,24.436771,34.297473,41.600845,170.365008,180.274159,186.818552,...,0,0,0,0,2013-12-09,2016-09-10,2017-07-24,2018-08-01,2019-07-22,-1000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296141,3,211.421955,200.192453,177.962607,22.149322,21.775425,21.220923,239.297084,229.193482,215.205832,...,0,0,0,0,2014-01-27,2014-11-19,2015-12-28,2017-02-25,2018-03-28,425
296142,2,196.476812,183.499638,171.576449,36.244103,44.609578,49.829084,162.912319,143.865217,122.935145,...,0,0,1,0,2014-01-27,2014-11-19,2015-12-28,2017-02-25,2018-03-28,425
296143,2,68.845906,62.948420,51.315925,16.453913,13.275921,11.146906,111.304320,94.723404,80.374597,...,0,0,1,0,2014-01-27,2014-11-19,2015-12-28,2017-02-25,2018-03-28,425
296144,2,98.718266,85.318885,72.572755,29.719189,26.584532,22.268071,137.374613,136.108359,113.544892,...,0,0,1,0,2014-01-27,2014-11-19,2015-12-28,2017-02-25,2018-03-28,-1000


In [None]:
savings_2_test=test_df.copy()
savings_2_train=train_df.copy()

In [None]:
savings_2_test

Unnamed: 0,img_red_mean_date1,img_green_mean_date1,img_blue_mean_date1,img_red_std_date1,img_green_std_date1,img_blue_std_date1,img_red_mean_date2,img_green_mean_date2,img_blue_mean_date2,img_red_std_date2,...,Industrial,Rural,Sparse Urban,Urban Slum,date0_1,date1_1,date2_1,date3_1,date4_1,temps_construction
0,187.051282,158.300672,140.517298,24.176292,22.166941,22.843903,194.900285,158.205433,136.564611,29.425890,...,0,0,0,0,2014-03-09,2016-10-12,2018-02-20,2019-07-18,2020-07-06,-1000.0
1,155.780912,135.136855,121.112545,42.975509,45.455919,46.561138,159.233193,126.860744,109.391957,61.307663,...,0,1,0,0,2014-03-09,2016-10-12,2018-02-20,2019-07-18,2020-07-06,948.0
2,160.848178,145.442915,128.102024,29.936871,28.324675,29.503189,211.808502,175.314170,151.197571,5.005763,...,0,1,0,0,2014-03-09,2016-10-12,2018-02-20,2019-07-18,2020-07-06,1233.0
3,175.678701,169.652776,161.492208,23.635791,21.618166,24.392068,198.152955,176.199221,159.886006,26.622888,...,0,0,0,0,2014-03-09,2016-10-12,2018-02-20,2019-07-18,2020-07-06,948.0
4,149.589170,143.869542,137.122018,69.388245,69.465516,68.974486,86.675438,74.335521,70.386567,86.886427,...,0,0,0,0,2014-03-09,2016-10-12,2018-02-20,2019-07-18,2020-07-06,-1000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120521,74.594406,72.981199,62.822552,24.472312,19.897044,19.887582,150.686552,148.127120,139.252284,37.886951,...,0,0,0,0,2012-10-01,2013-12-19,2015-12-08,2018-07-15,2019-07-29,597.0
120522,58.439701,62.549749,55.780079,21.440513,17.952821,17.289851,130.269785,130.813081,126.755200,37.457960,...,0,0,0,0,2012-10-01,2013-12-19,2015-12-08,2018-07-15,2019-07-29,597.0
120523,42.641541,50.166332,49.730848,10.641932,9.309863,6.805194,92.761926,95.726859,97.593212,63.645377,...,0,0,0,0,2012-10-01,2013-12-19,2015-12-08,2018-07-15,2019-07-29,597.0
120524,47.581117,55.697246,49.512326,10.637095,9.854314,7.726375,87.433888,94.893424,95.866188,55.564793,...,0,0,1,0,2012-10-01,2013-12-19,2015-12-08,2018-07-15,2019-07-29,597.0


In [None]:
train_df=train_df.assign(t1_t0=lambda x: 0)
train_df=train_df.assign(t2_t1=lambda x: 0)
train_df=train_df.assign(t3_t2=lambda x: 0)
train_df=train_df.assign(t4_t3=lambda x: 0)

test_df=test_df.assign(t1_t0=lambda x: 0)
test_df=test_df.assign(t2_t1=lambda x: 0)
test_df=test_df.assign(t3_t2=lambda x: 0)
test_df=test_df.assign(t4_t3=lambda x: 0)
#as the time is hardly accessible especially for knn, we compute the difference between 2 dates (in days)

In [None]:
for i in range(train.shape[0]):
        try:
            result = train_df.loc[i]
        except KeyError:
            result = None
            pass
        if result is not None:
            if is_valid_timestamp(train_df['date0_1'][i]) and is_valid_timestamp(train_df['date1_1'][i]) and is_valid_timestamp(train_df['date2_1'][i]) and is_valid_timestamp(train_df['date3_1'][i]) and is_valid_timestamp(train_df['date4_1'][i]) :
                train_df.loc[i,'t1_t0']=time_construction(train_df['date0_1'][i],train_df['date1_1'][i])
                train_df.loc[i,'t2_t1']=time_construction(train_df['date1_1'][i],train_df['date2_1'][i])
                train_df.loc[i,'t3_t2']=time_construction(train_df['date2_1'][i],train_df['date3_1'][i])
                train_df.loc[i,'t4_t3']=time_construction(train_df['date3_1'][i],train_df['date4_1'][i])

for i in range(test.shape[0]):
        try:
            result = test_df.loc[i]
        except KeyError:
            result = None
            pass
        if result is not None:
            if is_valid_timestamp(test_df['date0_1'][i]) and is_valid_timestamp(test_df['date1_1'][i]) and is_valid_timestamp(test_df['date2_1'][i]) and is_valid_timestamp(test_df['date3_1'][i]) and is_valid_timestamp(test_df['date4_1'][i]) :
                test_df.loc[i,'t1_t0']=time_construction(test_df['date0_1'][i],test_df['date1_1'][i])
                test_df.loc[i,'t2_t1']=time_construction(test_df['date1_1'][i],test_df['date2_1'][i])
                test_df.loc[i,'t3_t2']=time_construction(test_df['date2_1'][i],test_df['date3_1'][i])
                test_df.loc[i,'t4_t3']=time_construction(test_df['date3_1'][i],test_df['date4_1'][i])
# Time of this operation is generally 3min

In [None]:
test_df.columns
train_df.columns

Index(['change_type', 'img_red_mean_date1', 'img_green_mean_date1',
       'img_blue_mean_date1', 'img_red_std_date1', 'img_green_std_date1',
       'img_blue_std_date1', 'img_red_mean_date2', 'img_green_mean_date2',
       'img_blue_mean_date2', 'img_red_std_date2', 'img_green_std_date2',
       'img_blue_std_date2', 'img_red_mean_date3', 'img_green_mean_date3',
       'img_blue_mean_date3', 'img_red_std_date3', 'img_green_std_date3',
       'img_blue_std_date3', 'img_red_mean_date4', 'img_green_mean_date4',
       'img_blue_mean_date4', 'img_red_std_date4', 'img_green_std_date4',
       'img_blue_std_date4', 'img_red_mean_date5', 'img_green_mean_date5',
       'img_blue_mean_date5', 'img_red_std_date5', 'img_green_std_date5',
       'img_blue_std_date5', 'change_status_date0', 'change_status_date1',
       'change_status_date2', 'change_status_date3', 'change_status_date4',
       'index', 'area', 'Barren Land', 'Coastal', 'Dense Forest', 'Desert',
       'Farms', 'Grass Land', 'Hill

In [None]:
train_df=train_df.assign(construction_speed=lambda x: 0)

test_df=test_df.assign(construction_speed=lambda x: 0)

In [None]:
for i in range(train_df.shape[0]):
        try:
            result = train_df.loc[i]
        except KeyError:
            result = None
            pass
        if result is not None:
            if is_valid_timestamp(train_df['date0_1'][i]) and is_valid_timestamp(train_df['date1_1'][i]) and is_valid_timestamp(train_df['date2_1'][i]) and is_valid_timestamp(train_df['date3_1'][i]) and is_valid_timestamp(train_df['date4_1'][i]) :
                if train_df.loc[i,'temps_construction']!=None:
                    train_df.loc[i,'construction_speed']=train_df.loc[i,'area']/train_df.loc[i,'temps_construction']
                else : 
                    train_df.loc[i,'construction_speed']=0

for i in range(test_df.shape[0]):
        try:
            result = test_df.loc[i]
        except KeyError:
            result = None
            pass
        if result is not None:
            if is_valid_timestamp(test_df['date0_1'][i]) and is_valid_timestamp(test_df['date1_1'][i]) and is_valid_timestamp(test_df['date2_1'][i]) and is_valid_timestamp(test_df['date3_1'][i]) and is_valid_timestamp(test_df['date4_1'][i]) :
                if test_df.loc[i,'temps_construction']!=None:
                    test_df.loc[i,'construction_speed']=test_df.loc[i,'area']/test_df.loc[i,'temps_construction']
                else : 
                    test_df.loc[i,'construction_speed']=0

In [None]:
savings_3_train=train_df.copy()
savings_3_test=test_df.copy()

In [None]:
train_df=savings_3_train.copy()
test_df=savings_3_test.copy()

In [None]:
#one hot encoding du premier etat et dernier etat sur train

change_status_0=pd.DataFrame(train_df[['change_status_date0','index']])
change_status_0['change_status_date0'] = change_status_0['change_status_date0'].str.split(',')
change_status_0 = change_status_0.explode('change_status_date0')
y = change_status_0['change_status_date0'].str.get_dummies(sep=',')
change_status_0 = pd.concat([change_status_0,y], axis = 1)
change_status_0 = change_status_0.groupby(['index']).sum().reset_index()
#change_status_0=change_status_0.drop(columns=['A','N'],axis=1)


train_df=pd.merge(train_df,change_status_0,left_on='index',right_on='index')
train_df=train_df.drop(columns=['change_status_date0'],axis=1)

change_status_4=pd.DataFrame(train_df[['change_status_date4','index']])
change_status_4['change_status_date4'] = change_status_4['change_status_date4'].str.split(',')
change_status_4 = change_status_4.explode('change_status_date4')
y = change_status_4['change_status_date4'].str.get_dummies(sep=',')
change_status_4 = pd.concat([change_status_4,y], axis = 1)
change_status_4 = change_status_4.groupby(['index']).sum().reset_index()
#change_status_4=change_status_4.drop(columns=['A','N'],axis=1)


train_df=pd.merge(train_df,change_status_4,left_on='index',right_on='index')
train_df=train_df.drop(columns=['change_status_date4'],axis=1)


In [None]:
train_df

Unnamed: 0,change_type,img_red_mean_date1,img_green_mean_date1,img_blue_mean_date1,img_red_std_date1,img_green_std_date1,img_blue_std_date1,img_red_mean_date2,img_green_mean_date2,img_blue_mean_date2,...,Construction Done_y,Construction Midway_y,Construction Started_y,Excavation_y,Greenland_y,Land Cleared_y,Materials Dumped_y,Materials Introduced_y,Operational_y,Prior Construction_y
0,1,125.773062,139.833243,134.900701,28.269984,28.264907,25.008032,150.766726,158.964529,149.356684,...,1,0,0,0,0,0,0,0,0,0
1,1,133.097679,145.385190,137.092518,23.683964,21.651242,20.271657,184.480155,186.896779,174.235331,...,1,0,0,0,0,0,0,0,0,0
2,1,120.713490,131.633447,124.436492,28.951475,25.329365,22.505835,148.150431,158.490141,148.571269,...,1,0,0,0,0,0,0,0,0,0
3,1,114.819776,127.827828,120.435373,34.091970,27.963117,23.901639,148.322747,156.855136,148.180798,...,1,0,0,0,0,0,0,0,0,0
4,0,141.514462,171.079581,181.960612,24.436771,34.297473,41.600845,170.365008,180.274159,186.818552,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296141,3,211.421955,200.192453,177.962607,22.149322,21.775425,21.220923,239.297084,229.193482,215.205832,...,1,0,0,0,0,0,0,0,0,0
296142,2,196.476812,183.499638,171.576449,36.244103,44.609578,49.829084,162.912319,143.865217,122.935145,...,1,0,0,0,0,0,0,0,0,0
296143,2,68.845906,62.948420,51.315925,16.453913,13.275921,11.146906,111.304320,94.723404,80.374597,...,1,0,0,0,0,0,0,0,0,0
296144,2,98.718266,85.318885,72.572755,29.719189,26.584532,22.268071,137.374613,136.108359,113.544892,...,0,1,0,0,0,0,0,0,0,0


In [None]:
#one hot encoding du premier etat et dernier etat sur test

change_status_t_0=pd.DataFrame(test_df[['change_status_date0','index']])
change_status_t_0['change_status_date0'] = change_status_t_0['change_status_date0'].str.split(',')
change_status_t_0 = change_status_t_0.explode('change_status_date0')
y = change_status_t_0['change_status_date0'].str.get_dummies(sep=',')
change_status_t_0 = pd.concat([change_status_t_0,y], axis = 1)
change_status_t_0 = change_status_t_0.groupby(['index']).sum().reset_index()
#change_status_t_0=change_status_t_0.drop(columns=['A','N'],axis=1)

test_df=pd.merge(test_df,change_status_t_0,left_on='index',right_on='index')
test_df=test_df.drop(columns=['change_status_date0'],axis=1)

change_status_t_4=pd.DataFrame(test_df[['change_status_date4','index']])
change_status_t_4['change_status_date4'] = change_status_t_4['change_status_date4'].str.split(',')
change_status_t_4 = change_status_t_4.explode('change_status_date4')
y = change_status_t_4['change_status_date4'].str.get_dummies(sep=',')
change_status_t_4 = pd.concat([change_status_t_4,y], axis = 1)
change_status_t_4 = change_status_t_4.groupby(['index']).sum().reset_index()
#change_status_t_4=change_status_t_4.drop(columns=['A','N'],axis=1)

change_status_4
test_df=pd.merge(test_df,change_status_t_4,left_on='index',right_on='index')
test_df=test_df.drop(columns=['change_status_date4'],axis=1)

Normalisation des données

In [None]:
savings_4_test=test_df.copy()
savings_4_train=train_df.copy()

In [None]:
train_df=savings_4_train.copy()
test_df=savings_4_test.copy()

In [None]:
train_df=train_df.dropna(axis=0,how='any')
test_df=test_df.dropna(axis=0,how='any')

In [None]:
normalize=pd.DataFrame(train_df[['img_red_mean_date1', 'img_green_mean_date1',
       'img_blue_mean_date1', 'img_red_std_date1', 'img_green_std_date1',
       'img_blue_std_date1', 'img_red_mean_date2', 'img_green_mean_date2',
       'img_blue_mean_date2', 'img_red_std_date2', 'img_green_std_date2',
       'img_blue_std_date2', 'img_red_mean_date3', 'img_green_mean_date3',
       'img_blue_mean_date3', 'img_red_std_date3', 'img_green_std_date3',
       'img_blue_std_date3', 'img_red_mean_date4', 'img_green_mean_date4',
       'img_blue_mean_date4', 'img_red_std_date4', 'img_green_std_date4',
       'img_blue_std_date4', 'img_red_mean_date5', 'img_green_mean_date5',
       'img_blue_mean_date5', 'img_red_std_date5', 'img_green_std_date5',
       'img_blue_std_date5', 'area', 'temps_construction', 't1_t0', 't2_t1', 't3_t2',
       't4_t3', 'construction_speed']])

normalize_t=pd.DataFrame(test_df[['img_red_mean_date1', 'img_green_mean_date1',
       'img_blue_mean_date1', 'img_red_std_date1', 'img_green_std_date1',
       'img_blue_std_date1', 'img_red_mean_date2', 'img_green_mean_date2',
       'img_blue_mean_date2', 'img_red_std_date2', 'img_green_std_date2',
       'img_blue_std_date2', 'img_red_mean_date3', 'img_green_mean_date3',
       'img_blue_mean_date3', 'img_red_std_date3', 'img_green_std_date3',
       'img_blue_std_date3', 'img_red_mean_date4', 'img_green_mean_date4',
       'img_blue_mean_date4', 'img_red_std_date4', 'img_green_std_date4',
       'img_blue_std_date4', 'img_red_mean_date5', 'img_green_mean_date5',
       'img_blue_mean_date5', 'img_red_std_date5', 'img_green_std_date5',
       'img_blue_std_date5', 'area', 'temps_construction', 't1_t0', 't2_t1', 't3_t2',
       't4_t3', 'construction_speed']])

In [None]:
normalize=(normalize-normalize.mean())/normalize.std()
normalize_t=(normalize_t-normalize_t.mean())/normalize_t.std()

In [None]:
normalize_t

Unnamed: 0,img_red_mean_date1,img_green_mean_date1,img_blue_mean_date1,img_red_std_date1,img_green_std_date1,img_blue_std_date1,img_red_mean_date2,img_green_mean_date2,img_blue_mean_date2,img_red_std_date2,...,img_red_std_date5,img_green_std_date5,img_blue_std_date5,area,temps_construction,t1_t0,t2_t1,t3_t2,t4_t3,construction_speed
0,1.796640,1.329608,0.959396,-0.316264,-0.351194,-0.241224,2.027456,1.313981,0.799351,0.145748,...,-0.736307,-0.614911,-0.454508,-0.011886,-1.132599,2.544541,0.398760,0.184524,-0.533063,0.002542
1,1.003167,0.677510,0.406440,0.949193,1.259350,1.439896,1.095495,0.408442,0.029989,2.285146,...,1.042914,1.388234,1.643228,-0.012546,1.020115,2.544541,0.398760,0.184524,-0.533063,0.003870
2,1.131747,0.967642,0.605612,0.071505,0.074643,0.230798,2.469258,1.808247,1.213666,-1.492943,...,-0.082004,-0.153069,-0.009336,-0.012621,1.335065,2.544541,0.398760,0.184524,-0.533063,0.003729
3,1.508065,1.649188,1.557095,-0.352648,-0.389145,-0.131488,2.112446,1.833816,1.459669,-0.042345,...,-0.603507,-0.377194,-0.209437,-0.008652,1.020115,2.544541,0.398760,0.184524,-0.533063,0.007973
4,0.846054,0.923349,0.862645,2.727150,2.919728,3.028595,-0.800398,-1.108995,-1.074403,4.001586,...,2.922313,3.403438,3.548961,-0.011920,-1.132599,2.544541,0.398760,0.184524,-0.533063,0.002577
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120521,-1.056908,-1.072270,-1.254585,-0.296338,-0.508169,-0.450774,0.872176,1.022821,0.875449,0.713520,...,0.751497,0.180193,0.081847,-0.009012,0.632229,-0.089392,1.903162,2.665991,-0.391909,0.009986
120522,-1.466828,-1.365932,-1.455267,-0.500422,-0.642621,-0.634906,0.338698,0.522624,0.521609,0.684733,...,0.922505,-0.012278,0.231573,-0.009507,0.632229,-0.089392,1.903162,2.665991,-0.391909,0.009157
120523,-1.867700,-1.714544,-1.627645,-1.227321,-1.240323,-1.378077,-0.641361,-0.491006,-0.304079,2.442016,...,2.036394,1.689922,1.163235,-0.009914,0.632229,-0.089392,1.903162,2.665991,-0.391909,0.008475
120524,-1.742360,-1.558840,-1.633872,-1.227647,-1.202671,-1.312782,-0.780580,-0.515084,-0.352978,1.899776,...,2.012923,1.376404,1.033291,-0.010223,0.632229,-0.089392,1.903162,2.665991,-0.391909,0.007958


In [None]:
train_df[['img_red_mean_date1', 'img_green_mean_date1',
       'img_blue_mean_date1', 'img_red_std_date1', 'img_green_std_date1',
       'img_blue_std_date1', 'img_red_mean_date2', 'img_green_mean_date2',
       'img_blue_mean_date2', 'img_red_std_date2', 'img_green_std_date2',
       'img_blue_std_date2', 'img_red_mean_date3', 'img_green_mean_date3',
       'img_blue_mean_date3', 'img_red_std_date3', 'img_green_std_date3',
       'img_blue_std_date3', 'img_red_mean_date4', 'img_green_mean_date4',
       'img_blue_mean_date4', 'img_red_std_date4', 'img_green_std_date4',
       'img_blue_std_date4', 'img_red_mean_date5', 'img_green_mean_date5',
       'img_blue_mean_date5', 'img_red_std_date5', 'img_green_std_date5',
       'img_blue_std_date5', 'area', 'temps_construction', 't1_t0', 't2_t1', 't3_t2',
       't4_t3', 'construction_speed']]=normalize[['img_red_mean_date1', 'img_green_mean_date1',
       'img_blue_mean_date1', 'img_red_std_date1', 'img_green_std_date1',
       'img_blue_std_date1', 'img_red_mean_date2', 'img_green_mean_date2',
       'img_blue_mean_date2', 'img_red_std_date2', 'img_green_std_date2',
       'img_blue_std_date2', 'img_red_mean_date3', 'img_green_mean_date3',
       'img_blue_mean_date3', 'img_red_std_date3', 'img_green_std_date3',
       'img_blue_std_date3', 'img_red_mean_date4', 'img_green_mean_date4',
       'img_blue_mean_date4', 'img_red_std_date4', 'img_green_std_date4',
       'img_blue_std_date4', 'img_red_mean_date5', 'img_green_mean_date5',
       'img_blue_mean_date5', 'img_red_std_date5', 'img_green_std_date5',
       'img_blue_std_date5', 'area', 'temps_construction', 't1_t0', 't2_t1', 't3_t2',
       't4_t3', 'construction_speed']]

test_df[['img_red_mean_date1', 'img_green_mean_date1',
       'img_blue_mean_date1', 'img_red_std_date1', 'img_green_std_date1',
       'img_blue_std_date1', 'img_red_mean_date2', 'img_green_mean_date2',
       'img_blue_mean_date2', 'img_red_std_date2', 'img_green_std_date2',
       'img_blue_std_date2', 'img_red_mean_date3', 'img_green_mean_date3',
       'img_blue_mean_date3', 'img_red_std_date3', 'img_green_std_date3',
       'img_blue_std_date3', 'img_red_mean_date4', 'img_green_mean_date4',
       'img_blue_mean_date4', 'img_red_std_date4', 'img_green_std_date4',
       'img_blue_std_date4', 'img_red_mean_date5', 'img_green_mean_date5',
       'img_blue_mean_date5', 'img_red_std_date5', 'img_green_std_date5',
       'img_blue_std_date5', 'area', 'temps_construction', 't1_t0', 't2_t1', 't3_t2',
       't4_t3', 'construction_speed']]=normalize_t[['img_red_mean_date1', 'img_green_mean_date1',
       'img_blue_mean_date1', 'img_red_std_date1', 'img_green_std_date1',
       'img_blue_std_date1', 'img_red_mean_date2', 'img_green_mean_date2',
       'img_blue_mean_date2', 'img_red_std_date2', 'img_green_std_date2',
       'img_blue_std_date2', 'img_red_mean_date3', 'img_green_mean_date3',
       'img_blue_mean_date3', 'img_red_std_date3', 'img_green_std_date3',
       'img_blue_std_date3', 'img_red_mean_date4', 'img_green_mean_date4',
       'img_blue_mean_date4', 'img_red_std_date4', 'img_green_std_date4',
       'img_blue_std_date4', 'img_red_mean_date5', 'img_green_mean_date5',
       'img_blue_mean_date5', 'img_red_std_date5', 'img_green_std_date5',
       'img_blue_std_date5', 'area', 'temps_construction', 't1_t0', 't2_t1', 't3_t2',
       't4_t3', 'construction_speed']]

In [None]:
test_df

Unnamed: 0,img_red_mean_date1,img_green_mean_date1,img_blue_mean_date1,img_red_std_date1,img_green_std_date1,img_blue_std_date1,img_red_mean_date2,img_green_mean_date2,img_blue_mean_date2,img_red_std_date2,...,Construction Done_y,Construction Midway_y,Construction Started_y,Excavation_y,Greenland_y,Land Cleared_y,Materials Dumped_y,Materials Introduced_y,Operational_y,Prior Construction_y
0,1.796640,1.329608,0.959396,-0.316264,-0.351194,-0.241224,2.027456,1.313981,0.799351,0.145748,...,0,0,0,0,0,0,0,1,0,0
1,1.003167,0.677510,0.406440,0.949193,1.259350,1.439896,1.095495,0.408442,0.029989,2.285146,...,1,0,0,0,0,0,0,0,0,0
2,1.131747,0.967642,0.605612,0.071505,0.074643,0.230798,2.469258,1.808247,1.213666,-1.492943,...,1,0,0,0,0,0,0,0,0,0
3,1.508065,1.649188,1.557095,-0.352648,-0.389145,-0.131488,2.112446,1.833816,1.459669,-0.042345,...,1,0,0,0,0,0,0,0,0,0
4,0.846054,0.923349,0.862645,2.727150,2.919728,3.028595,-0.800398,-1.108995,-1.074403,4.001586,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120521,-1.056908,-1.072270,-1.254585,-0.296338,-0.508169,-0.450774,0.872176,1.022821,0.875449,0.713520,...,1,0,0,0,0,0,0,0,0,0
120522,-1.466828,-1.365932,-1.455267,-0.500422,-0.642621,-0.634906,0.338698,0.522624,0.521609,0.684733,...,1,0,0,0,0,0,0,0,0,0
120523,-1.867700,-1.714544,-1.627645,-1.227321,-1.240323,-1.378077,-0.641361,-0.491006,-0.304079,2.442016,...,1,0,0,0,0,0,0,0,0,0
120524,-1.742360,-1.558840,-1.633872,-1.227647,-1.202671,-1.312782,-0.780580,-0.515084,-0.352978,1.899776,...,1,0,0,0,0,0,0,0,0,0


In [None]:
savings_5_train=train_df.copy()
savings_5_test=test_df.copy()

In [None]:
train_df=savings_5_train.copy()
test_df=savings_5_test.copy()


In [None]:
train_df=train_df.drop(columns=['construction_speed'])

Code kNN

In [None]:
#Définition des X_train and Y_train
train_df_random=train_df.sample(frac=1)
train_lines=train_df_random[:int(0.8*len(train_df))]
train_x = np.asarray(train_lines[['img_red_mean_date1', 'img_green_mean_date1',
       'img_blue_mean_date1', 'img_red_std_date1', 'img_green_std_date1',
       'img_blue_std_date1', 'img_red_mean_date2', 'img_green_mean_date2',
       'img_blue_mean_date2', 'img_red_std_date2', 'img_green_std_date2',
       'img_blue_std_date2', 'img_red_mean_date3', 'img_green_mean_date3',
       'img_blue_mean_date3', 'img_red_std_date3', 'img_green_std_date3',
       'img_blue_std_date3', 'img_red_mean_date4', 'img_green_mean_date4',
       'img_blue_mean_date4', 'img_red_std_date4', 'img_green_std_date4',
       'img_blue_std_date4', 'img_red_mean_date5', 'img_green_mean_date5',
       'img_blue_mean_date5', 'img_red_std_date5', 'img_green_std_date5',
       'img_blue_std_date5', 'area', 'Barren Land', 'Coastal',
       'Dense Forest', 'Desert', 'Farms', 'Grass Land', 'Hills', 'Lakes',
       'River', 'Snow', 'Sparse Forest', 'Dense Urban', 'Industrial', 'Rural',
       'Sparse Urban', 'Urban Slum','temps_construction', 't1_t0', 't2_t1', 't3_t2',
       't4_t3', 'Construction Done_x',
       'Construction Midway_x', 'Construction Started_x', 'Excavation_x',
       'Greenland_x', 'Land Cleared_x', 'Materials Dumped_x',
       'Materials Introduced_x', 'Operational_x', 'Prior Construction_x',
       'Construction Done_y', 'Construction Midway_y',
       'Construction Started_y', 'Excavation_y', 'Greenland_y',
       'Land Cleared_y', 'Materials Dumped_y', 'Materials Introduced_y',
       'Operational_y', 'Prior Construction_y']])

train_y = np.asarray(train_lines[['change_type']])

#Définition des X_test et Y_test
test_lines=train_df_random[int(0.8*len(train_df)):]

test_x=np.asarray(test_lines[['img_red_mean_date1', 'img_green_mean_date1',
       'img_blue_mean_date1', 'img_red_std_date1', 'img_green_std_date1',
       'img_blue_std_date1', 'img_red_mean_date2', 'img_green_mean_date2',
       'img_blue_mean_date2', 'img_red_std_date2', 'img_green_std_date2',
       'img_blue_std_date2', 'img_red_mean_date3', 'img_green_mean_date3',
       'img_blue_mean_date3', 'img_red_std_date3', 'img_green_std_date3',
       'img_blue_std_date3', 'img_red_mean_date4', 'img_green_mean_date4',
       'img_blue_mean_date4', 'img_red_std_date4', 'img_green_std_date4',
       'img_blue_std_date4', 'img_red_mean_date5', 'img_green_mean_date5',
       'img_blue_mean_date5', 'img_red_std_date5', 'img_green_std_date5',
       'img_blue_std_date5', 'area', 'Barren Land', 'Coastal',
       'Dense Forest', 'Desert', 'Farms', 'Grass Land', 'Hills', 'Lakes',
       'River', 'Snow', 'Sparse Forest', 'Dense Urban', 'Industrial', 'Rural',
       'Sparse Urban', 'Urban Slum','temps_construction', 't1_t0', 't2_t1', 't3_t2',
       't4_t3', 'Construction Done_x',
       'Construction Midway_x', 'Construction Started_x', 'Excavation_x',
       'Greenland_x', 'Land Cleared_x', 'Materials Dumped_x',
       'Materials Introduced_x', 'Operational_x', 'Prior Construction_x',
       'Construction Done_y', 'Construction Midway_y',
       'Construction Started_y', 'Excavation_y', 'Greenland_y',
       'Land Cleared_y', 'Materials Dumped_y', 'Materials Introduced_y',
       'Operational_y', 'Prior Construction_y']])

test_y = np.asarray(test_lines[['change_type']])


In [None]:
#kNN et test avec entrainement sur 80% de la data et test sur les autres 20%

## Train a simple OnveVsRestClassifier using featurized data
for k in range(5,30,5):
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(train_x, train_y)
    pred_y = neigh.predict(test_x)
    print(accuracy_score(test_y,pred_y),k)





  return self._fit(X, y)


0.6822482579587376 5


  return self._fit(X, y)


0.6890285558136358 10


  return self._fit(X, y)


0.6874573029102337 15


  return self._fit(X, y)


0.6858689711709249 20


  return self._fit(X, y)


0.6823848886459899 25


In [None]:
print(accuracy_score(test_y,pred_y))

0.6823848886459899


In [None]:
#entrainment kNN avec toute la data

X_train=np.asarray(train_df[['area', 'Barren Land', 'Coastal', 'Dense Forest', 'Desert', 'Farms', 'Grass Land', 'Hills',
       'Lakes', 'River', 'Snow', 'Sparse Forest', 'Dense Urban', 'Industrial',
       'Rural', 'Sparse Urban', 'Urban Slum']])

Y_train = np.asarray(train_df[['change_type']])

X_test=np.asarray(test_df[['area', 'Barren Land', 'Coastal', 'Dense Forest', 'Desert', 'Farms', 'Grass Land', 'Hills',
       'Lakes', 'River', 'Snow', 'Sparse Forest', 'Dense Urban', 'Industrial',
       'Rural', 'Sparse Urban', 'Urban Slum']])

In [None]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, Y_train)
Y_pred = neigh.predict(X_test)





  return self._fit(X, y)


In [None]:
Y_pred.to_csv(kNN, index=False)

AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'