In [1]:
import pandas as pd
import numpy as np
import random as rnd
import matplotlib.pyplot as plt
import scipy
import math
from scipy import stats
from scipy import spatial
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq,df)
import statsmodels.api as sm

  from pandas.core import datetools


In [2]:
AirQualityCat = ['air-qual-good','air-qual-moderate','air-qual-unhealthy','air-qual-hazardous']
WeatherCat = ['weather-rainy','weather-stormy','weather-sunny','weather-cloudy','weather-hot','weather-cold','weather-dry','weather-wet','weather-windy','weather-snow']
TrafficCat = ['traffic-low','traffic-moderate','traffic-high','traffic-worse']
DayPartCat = ['dt-early-morning','dt-morning','dt-noon','dt-afternoon','dt-night','dt-late-night']
BikeLanesCat = ['bl-none','bl-partial','bl-full']
TemperatureCat = ['temp-0-10','temp-10-20','temp-20-30','temp-30-40','temp-40-50','temp-50-60','temp-60-70','temp-70-80','temp-80-90','temp-90-100']
DistanceCat = ['dist-1-5','dist-5-10','dist-10-15','dist-15-20','dist-20-25','dist-25-30','dist-30-35']
TimeCat = ['time-1-10','time-10-20','time-20-30','time-30-40','time-40-50','time-50-60']
ModesCat = ['mode-car','mode-transit','mode-bike','mode-mt-bike','mode-walk']

In [3]:
CatCombined = []
CatCombined.extend(AirQualityCat)
CatCombined.extend(WeatherCat)
CatCombined.extend(TrafficCat)
CatCombined.extend(DayPartCat)
CatCombined.extend(BikeLanesCat)
CatCombined.extend(TemperatureCat)
CatCombined.extend(DistanceCat)
CatCombined.extend(TimeCat)

In [4]:
IndCombined = []
IndCombined.extend(CatCombined)
CatCombined.extend(ModesCat)

In [12]:

def ConvertTemperature(intValue,tempCategories):
    outArr = [0]*len(tempCategories)
    myVal = math.floor(intValue/len(tempCategories) - 1)
    outArr[myVal] = 1
    return outArr

def ConvertAirQuality(intValue,aqCategories):
    outArr = [0]*len(aqCategories)
    myVal = math.ceil((intValue / 2.5) - 1)
    outArr[myVal] = 1
    return outArr

def ComputePearson(indName,colName,dFrame):
    firstElement = dFrame.loc[indName,colName]
    secondElement = dFrame.drop(colName,axis=1).loc[indName,:].sum()
    thirdElement = dFrame.drop(indName).loc[:,colName].sum()
    fourthElement = dFrame.drop(indName).drop(colName,axis=1).sum(axis=1).sum()
    varPearson = ((firstElement*fourthElement) - (thirdElement*secondElement))/math.sqrt((firstElement+thirdElement)*(secondElement+fourthElement)*(firstElement+secondElement)*(thirdElement+fourthElement))
    return varPearson

def ComputeDistance(distance):
    if distance >=1 and distance <=5:
        return 'dist-1-5'
    elif distance > 5 and distance <= 10: 
        return 'dist-5-10'
    elif distance > 10 and distance <= 15:
        return 'dist-10-15'
    elif distance > 15 and distance <= 20:
        return 'dist-15-20'
    elif distance > 20 and distance <= 25:
        return 'dist-20-25'
    elif distance > 25 and distance <= 30:
        return 'dist-25-30'
    elif distance > 30 and distance <= 35:
        return 'dist-30-35'
    
def ComputeTime(time):
    if time >= 1 and time <= 10:
        return 'time-1-10'
    elif time > 10 and time <= 20:
        return 'time-10-20'
    elif time > 20 and time <= 30:
        return 'time-20-30'
    elif time > 30 and time <= 40:
        return 'time-30-40'
    elif time > 40 and time <= 50:
        return 'time-40-50'
    elif time > 50 and time <= 60:
        return 'time-50-60'
    
def ComputeTemperature(temperature):
    if temperature >= 0 and temperature <= 10:
        return 'temp-0-10'
    elif temperature > 10 and temperature <= 20:
        return 'temp-10-20'
    elif temperature > 20 and temperature <= 30:
        return 'temp-20-30'
    elif temperature > 30 and temperature <= 40:
        return 'temp-30-40'
    elif temperature > 40 and temperature <= 50:
        return 'temp-40-50'
    elif temperature > 50 and temperature <= 60:
        return 'temp-50-60'
    elif temperature > 60 and temperature <= 70:
        return 'temp-60-70'
    elif temperature > 70 and temperature <= 80:
        return 'temp-70-80'
    elif temperature > 80 and temperature <= 90:
        return 'temp-80-90'
    elif temperature > 90 and temperature <= 100:
        return 'temp-90-100'
    
def ComputeAirQuality(airQuality):
    if(airQuality >=1 and airQuality <= 2):
        return 'air-qual-good'
    elif(airQuality > 2 and airQuality <= 5):
        return 'air-qual-moderate'
    elif(airQuality > 5 and airQuality <= 7):
        return 'air-qual-unhealthy'
    elif(airQuality > 7 and airQuality <= 10):
        return 'air-qual-hazardous'
    
    
def GetMaxIndex(tVector):
    myMax = tVector[0]
    myMaxIndex = 0
    iter = 0;
    for item in tVector:
        if item > myMax:
            myMax = item
            myMaxIndex = iter
        iter = iter + 1
    return myMaxIndex

    

In [15]:

# filesArr = ['User1.xlsx','User2.xlsx','User3.xlsx','User4.xlsx','User5.xlsx']
filesArr = ['User6.xlsx']
foldPath = 'UserProfiles/'
for strFile in filesArr:
    xFile = pd.ExcelFile(foldPath + strFile)
    df = xFile.parse('data')
    dataArr = np.ones((len(IndCombined),len(ModesCat)))
    for i in range(len(df.index)):
        freshVal = int(df.iloc[i]['Freshness'])
        freshVal = freshVal + 50
#         tAirQual = ConvertAirQuality(int(df.iloc[i]['air-quality']),AirQualityCat)
        tAirQual = ComputeAirQuality(int(df.iloc[i]['air-quality']))
        tWeather = df.iloc[i]['weather']
        tTraffic = df.iloc[i]['traffic']
        tDayTime = df.iloc[i]['day-time']
        tBikeLanes = df.iloc[i]['bike-lane']
        tTemperature = ComputeTemperature(df.iloc[i]['temperature'])
        tDistance = ComputeDistance(df.iloc[i]['distance'])
        tTime = ComputeTime(df.iloc[i]['time'])
        tMode = df.iloc[i]['mode']
        tModeIndex = ModesCat.index(tMode);
        tAirQualIndex = tAirQual.index(tAirQual)
        tWeatherIndex = IndCombined.index(tWeather)
        tTrafficIndex = IndCombined.index(tTraffic)
        tDayTimeIndex = IndCombined.index(tDayTime)
        tBikeLanesIndex = IndCombined.index(tBikeLanes)
        tDistanceIndex = IndCombined.index(tDistance)
        tTemperatureIndex = IndCombined.index(tTemperature)
        tTimeIndex = IndCombined.index(tTime)

        dataArr[tAirQualIndex][tModeIndex] =  dataArr[tAirQualIndex][tModeIndex] + freshVal + 1
        dataArr[tWeatherIndex][tModeIndex] =  dataArr[tWeatherIndex][tModeIndex] + freshVal + 1
        dataArr[tTrafficIndex][tModeIndex] =  dataArr[tTrafficIndex][tModeIndex] + freshVal + 1
        dataArr[tDayTimeIndex][tModeIndex] =  dataArr[tDayTimeIndex][tModeIndex] + freshVal + 1
        dataArr[tBikeLanesIndex][tModeIndex] =  dataArr[tBikeLanesIndex][tModeIndex] + freshVal + 1
        dataArr[tTemperatureIndex][tModeIndex] =  dataArr[tTemperatureIndex][tModeIndex] + freshVal + 1
        dataArr[tDistanceIndex][tModeIndex] = dataArr[tDistanceIndex][tModeIndex] + freshVal + 1
        dataArr[tTimeIndex][tModeIndex] = dataArr[tTimeIndex][tModeIndex] + freshVal + 1

        tdf = pd.DataFrame(dataArr,index=IndCombined,columns=ModesCat)
        tArr = RankMode([tAirQual,tWeather,tTraffic,tDayTime,tBikeLanes,tTemperature,tDistance,tTime],tdf.columns.values,tdf)
        tIndex = GetMaxIndex(tArr)
    #     dataArr[tAirQualIndex][tModeIndex] =  dataArr[tAirQualIndex][tModeIndex] + 1
    #     dataArr[tWeatherIndex][tModeIndex] =  dataArr[tAirQualIndex][tModeIndex] + 1
    #     dataArr[tTrafficIndex][tModeIndex] =  dataArr[tAirQualIndex][tModeIndex] + 1
    #     dataArr[tDayTimeIndex][tModeIndex] =  dataArr[tAirQualIndex][tModeIndex] + 1
    #     dataArr[tBikeLanesIndex][tModeIndex] =  dataArr[tAirQualIndex][tModeIndex] + 1
    #     dataArr[-len(TemperatureCat) + tTemperature.index(1)][tModeIndex] =  dataArr[tAirQualIndex][tModeIndex] + 1
        print(ModesCat[tIndex])
    
    mydf = pd.DataFrame(dataArr,index=IndCombined,columns=ModesCat)     

    #print(mydf.loc['air-qual-good','mode-transit'])

    idrop = mydf.drop('mode-car',axis=1).loc['air-qual-good',:].sum()
    #odrop = idrop.drop('mode-car',axis=1)

    print(mydf)


mode-bike
mode-transit
mode-bike
mode-mt-bike
mode-bike
mode-car
mode-car
mode-mt-bike
mode-car
mode-walk
mode-transit
mode-transit
mode-bike
mode-car
mode-transit
mode-bike
mode-mt-bike
mode-transit
mode-bike
mode-transit
mode-walk
mode-walk
mode-walk
mode-car
mode-bike
mode-bike
mode-mt-bike
mode-mt-bike
mode-walk
mode-walk
mode-car
mode-car
mode-mt-bike
mode-walk
mode-walk
mode-car
mode-car
mode-car
mode-walk
mode-walk
mode-car
mode-mt-bike
mode-mt-bike
mode-car
mode-car
mode-bike
mode-car
mode-car
mode-transit
mode-mt-bike
mode-transit
mode-car
mode-car
mode-mt-bike
mode-walk
mode-bike
mode-transit
mode-car
mode-transit
mode-transit
mode-transit
mode-car
mode-car
mode-walk
mode-walk
mode-car
mode-walk
mode-bike
mode-car
mode-transit
mode-walk
mode-bike
mode-transit
mode-car
mode-car
mode-bike
mode-transit
mode-transit
mode-walk
mode-transit
mode-car
mode-walk
mode-walk
mode-car
mode-car
mode-walk
mode-walk
mode-mt-bike
mode-car
mode-car
mode-transit
mode-car
mode-walk
mode-transit


In [56]:
ComputePearson('air-qual-good','mode-car',mydf)

-0.0016271009390274366

In [7]:
def RankMode(indNames,colNames,dataFrame):
    scoreArray = [0]*len(colNames)
    for i in range(len(colNames)):
        score = 0
        for j in range(len(indNames)):
            pVal = ComputePearson(indNames[j],colNames[i],dataFrame)
            score = score + pVal * (dataFrame.loc[indNames[j],colNames[i]])
        scoreArray[i] = score
    return scoreArray

In [58]:
print(mydf.columns.values)

['mode-car' 'mode-transit' 'mode-bike' 'mode-mt-bike' 'mode-walk']


In [61]:
print(RankMode(["air-qual-hazardous","weather-windy","traffic-high","dt-afternoon","temp-20-30"],mydf.columns.values,mydf))

tArr = RankMode(["air-qual-hazardous","weather-windy","traffic-high","dt-afternoon","temp-20-30"],mydf.columns.values,mydf)

tMode = GetMaxIndex(tArr)

tMode

[7.54524744355473, 38.192888493289054, -2.0113492470212964, -22.5710242633174, 8.44690966536225]


1

In [103]:
dataset1 = [1,0,1,1,1,0,1,0,1,1,0,1]
dataset2 = [1,1,1,1,1,1,1,0,1,1,0,1]

print(spatial.distance.cosine(dataset1,dataset2))

0.10557280900008414
