In [1]:
from path import Path
import pandas as pd
from collections import Counter

In [2]:
data = Path('data/walkability.csv')
walkability_df = pd.read_csv(data)
walkability_df.head()

Unnamed: 0,CBSA_POP,CBSA_EMP,CBSA_WRK,Ac_Total,Ac_Water,Ac_Land,Ac_Unpr,TotPop,CountHU,HH,...,D5CEI,D5DR,D5DRI,D5DE,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd
0,7189384,3545715,3364458,73.595028,0.0,73.595028,73.595028,1202,460.0,423.0,...,0.841299,0.000525,0.184697,0.000476,0.137707,6,14,15,17,14.0
1,7189384,3545715,3364458,119.829909,0.0,119.829909,119.2142,710,409.0,409.0,...,0.753958,0.000919,0.323221,0.000801,0.231868,3,10,12,14,10.833333
2,7189384,3545715,3364458,26.367053,0.0,26.367053,26.36705,737,365.0,329.0,...,0.800475,0.000894,0.314628,0.000736,0.213146,1,1,7,17,8.333333
3,7189384,3545715,3364458,119.060687,0.0,119.060687,119.060687,904,384.0,384.0,...,0.825778,0.000653,0.229821,0.000708,0.205018,16,10,17,17,15.666667
4,7189384,3545715,3364458,169.927211,0.0,169.927211,148.74292,948,343.0,343.0,...,0.659846,0.000469,0.164863,0.000433,0.125296,4,7,11,14,10.166667


In [3]:
walkability_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167265 entries, 0 to 167264
Columns: 104 entries, CBSA_POP to NatWalkInd
dtypes: float64(67), int64(37)
memory usage: 132.7 MB


## Encoding

In [4]:
walkability_df['NatWalkInd'] = walkability_df['NatWalkInd'].apply(lambda x: 'walkable' if x > 10 else 'non_walkable')
walkability_df.head()

Unnamed: 0,CBSA_POP,CBSA_EMP,CBSA_WRK,Ac_Total,Ac_Water,Ac_Land,Ac_Unpr,TotPop,CountHU,HH,...,D5CEI,D5DR,D5DRI,D5DE,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd
0,7189384,3545715,3364458,73.595028,0.0,73.595028,73.595028,1202,460.0,423.0,...,0.841299,0.000525,0.184697,0.000476,0.137707,6,14,15,17,walkable
1,7189384,3545715,3364458,119.829909,0.0,119.829909,119.2142,710,409.0,409.0,...,0.753958,0.000919,0.323221,0.000801,0.231868,3,10,12,14,walkable
2,7189384,3545715,3364458,26.367053,0.0,26.367053,26.36705,737,365.0,329.0,...,0.800475,0.000894,0.314628,0.000736,0.213146,1,1,7,17,non_walkable
3,7189384,3545715,3364458,119.060687,0.0,119.060687,119.060687,904,384.0,384.0,...,0.825778,0.000653,0.229821,0.000708,0.205018,16,10,17,17,walkable
4,7189384,3545715,3364458,169.927211,0.0,169.927211,148.74292,948,343.0,343.0,...,0.659846,0.000469,0.164863,0.000433,0.125296,4,7,11,14,walkable


In [5]:
# Check balance of target
class_counts = walkability_df['NatWalkInd'].value_counts()
class_counts

walkable        86486
non_walkable    80779
Name: NatWalkInd, dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
walkability_df = walkability_df.copy()
walkability_df['NatWalkInd'] = le.fit_transform(walkability_df['NatWalkInd']) 
walkability_df.head()

Unnamed: 0,CBSA_POP,CBSA_EMP,CBSA_WRK,Ac_Total,Ac_Water,Ac_Land,Ac_Unpr,TotPop,CountHU,HH,...,D5CEI,D5DR,D5DRI,D5DE,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd
0,7189384,3545715,3364458,73.595028,0.0,73.595028,73.595028,1202,460.0,423.0,...,0.841299,0.000525,0.184697,0.000476,0.137707,6,14,15,17,1
1,7189384,3545715,3364458,119.829909,0.0,119.829909,119.2142,710,409.0,409.0,...,0.753958,0.000919,0.323221,0.000801,0.231868,3,10,12,14,1
2,7189384,3545715,3364458,26.367053,0.0,26.367053,26.36705,737,365.0,329.0,...,0.800475,0.000894,0.314628,0.000736,0.213146,1,1,7,17,0
3,7189384,3545715,3364458,119.060687,0.0,119.060687,119.060687,904,384.0,384.0,...,0.825778,0.000653,0.229821,0.000708,0.205018,16,10,17,17,1
4,7189384,3545715,3364458,169.927211,0.0,169.927211,148.74292,948,343.0,343.0,...,0.659846,0.000469,0.164863,0.000433,0.125296,4,7,11,14,1


## Separate Features from Target

In [7]:
y = walkability_df["NatWalkInd"]
X = walkability_df.drop(columns=["NatWalkInd"])

## Split data into training and testing

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1, stratify=y)

X_train.shape

(125448, 103)

## Create a logistic regression model

In [9]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(solver='liblinear', random_state=1)
classifier

LogisticRegression(random_state=1, solver='liblinear')

## Train(fit) the model using the training data

In [10]:
classifier.fit(X_train, y_train)

LogisticRegression(random_state=1, solver='liblinear')

## Make predictions

In [11]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,0,0
3,0,1
4,0,0
5,0,0
6,1,1
7,1,1
8,1,1
9,1,1


## Validate the model

In [12]:
# Generate an accuracy score
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_pred))

0.930219767080374


In [13]:
# Generate a confusion matrix
from sklearn.metrics import confusion_matrix, classification_report

matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[18861  1334]
 [ 1584 20038]]


In [14]:
# Generate a classification report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.92      0.93      0.93     20195
           1       0.94      0.93      0.93     21622

    accuracy                           0.93     41817
   macro avg       0.93      0.93      0.93     41817
weighted avg       0.93      0.93      0.93     41817

