# Preprocessing of data

### Importings

In [2]:
import pandas as pd
import matplotlib.pyplot as pyplot
import datetime

### Print length of dataset and first 10 rows of the data

In [3]:
data = pd.read_csv('train.csv', nrows=50)
originalLength = len(data)

In [4]:
print("Original størrelse på datasettet:")
print(originalLength)
data.head(10)

Original størrelse på datasettet:
50


Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."
5,1372636965620000231,C,,,20000231,1372636965,A,False,"[[-8.615502,41.140674],[-8.614854,41.140926],[..."
6,1372637210620000456,C,,,20000456,1372637210,A,False,"[[-8.57952,41.145948],[-8.580942,41.145039],[-..."
7,1372637299620000011,C,,,20000011,1372637299,A,False,"[[-8.617563,41.146182],[-8.617527,41.145849],[..."
8,1372637274620000403,C,,,20000403,1372637274,A,False,"[[-8.611794,41.140557],[-8.611785,41.140575],[..."
9,1372637905620000320,C,,,20000320,1372637905,A,False,"[[-8.615907,41.140557],[-8.614449,41.141088],[..."


### Removing unwanted rows from dataset

Remove all columns with MISSING_DATA=True. Because we only want the trips that do not miss any datapoint

In [5]:
data = data[data['MISSING_DATA']!=True]

In [6]:
print("New length of dataset:")
newLength1 = len(data)
print(newLength1)

print("Number of removed rows:")
print(originalLength - len(data))

New length of dataset:
50
Number of removed rows:
0


### Removing columns that we do not need anymore: 
ORIGIN_CALL, ORIGIN_STAND, TAXI_ID and MISSING_DATA (already used the info we needed from this column).

In [7]:
data.drop(['ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID', 'MISSING_DATA'], axis=1, inplace=True)
data.head(10)

Unnamed: 0,TRIP_ID,CALL_TYPE,TIMESTAMP,DAY_TYPE,POLYLINE
0,1372636858620000589,C,1372636858,A,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,1372637303,A,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,1372636951,A,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,1372636854,A,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,1372637091,A,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."
5,1372636965620000231,C,1372636965,A,"[[-8.615502,41.140674],[-8.614854,41.140926],[..."
6,1372637210620000456,C,1372637210,A,"[[-8.57952,41.145948],[-8.580942,41.145039],[-..."
7,1372637299620000011,C,1372637299,A,"[[-8.617563,41.146182],[-8.617527,41.145849],[..."
8,1372637274620000403,C,1372637274,A,"[[-8.611794,41.140557],[-8.611785,41.140575],[..."
9,1372637905620000320,C,1372637905,A,"[[-8.615907,41.140557],[-8.614449,41.141088],[..."


# Create subsets and check if they are good enough

## Saving "expected" values. The number of rows from each call type

In [8]:
#finding the number of rows of each call type (A, B, C) in the original dataset
original_typea_count = len(data[data['CALL_TYPE']=='A'])
original_typeb_count = len(data[data['CALL_TYPE']=='B'])
original_typec_count = len(data[data['CALL_TYPE']=='C'])

In [9]:
#calculate the schi_square value given input which is the count from a subset. 
#Then calculate the value of the schi_square of the subset
def chi_square_call_type(type_a, type_b, type_c):
    partA = ((type_a - original_typea_count)^2)/original_typea_count
    partB = ((type_b - original_typeb_count)^2)/original_typeb_count
    partC = ((type_c - original_typec_count)^2)/original_typec_count
    chi_square = partA + partB + partC
    return chi_square

In [11]:
def get_info_from_timestamps(subset):
    hours_dict = {
        0:0,
        1:0,
        2:0,
        3:0,
        4:0,
        5:0,
        6:0,
        7:0,
        8:0,
        9:0,
        10:0,
        11:0,
        12:0,
        13:0,
        14:0,
        15:0,
        16:0,
        17:0,
        18:0,
        19:0,
        20:0,
        21:0,
        22:0,
        23:0
    }
    days_dict={
        "Monday":0,
        "Tuesday":0,
        "Wednesday":0,
        "Thursday":0,
        "Friday":0,
        "Saturday":0,
        "Sunday":0
    }
    months_dict={
        1:0,
        2:0,
        3:0,
        4:0,
        5:0,
        6:0,
        7:0,
        8:0,
        9:0,
        10:0,
        11:0,
        12:0
    }
    
    for row in subset['DATETIME']:
        time = datetime.datetime.fromtimestampp(row)
        hours_dict[time.hour]+=1
        days_dict[time.strftime("%A")]
        months_dict[time.month]+=1
    return hours_dict, days_dict, months_dict

### Saving the timestamp-info from original data

In [10]:
#finding the number of rows of each hour, weekday and month from timestamp in the original dataset
original_hours_dict, original_weekdays_dict, original_months_dict = get_info_from_timestamps(data)


###

### Printing the histograms and calculating chi-square of each variable, both for the original dataset and the subset.

CALL_TYPE:

In [12]:
data_sorted = data.sort_values(by='CALL_TYPE')
subset1_sorted = subset1.sort_values(by='CALL_TYPE')

pyplot.hist(data_sorted['CALL_TYPE'], bins='auto')
pyplot.xlabel('Call type')
pyplot.ylabel('number of rows')
pyplot.title('Original dataset. CALL_TYPE')
pyplot.show()

pyplot.hist(subset1_sorted['CALL_TYPE'], bins='auto')
pyplot.xlabel('Call type')
pyplot.ylabel('number of rows')
pyplot.title('Subset1, CALL_TYPE')
pyplot.show()

NameError: name 'subset1' is not defined

In [None]:
#regne ut tall for CALL_TYPE
# if tall>threshold
    #regne ut tall for ....
    #if tall>threshold
        #regne ut....
#else
#velge nye random trajectories