#### US Postal Service project ####

In [10]:
# Import required libraries

import pandas as pd
import numpy as np
import seaborn
from matplotlib import pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split

In [11]:
# Read in the data

df = pd.read_csv('parcel_data.csv')

print(df.head())

     weight sender_state  parcel_type
0       NaN           NC          1.0
1  2.600236           RI          1.0
2  3.468107           NY          1.0
3  5.361340           MN          1.0
4  4.801337           NV          1.0


In [12]:
# Check for missing values

print(f"Missing values\n{df.isna().sum()}")

Missing values
weight          59
sender_state     0
parcel_type      0
dtype: int64


In [13]:
# Impute missing values

df['weight'] = df['weight'].fillna(df['weight'].mean())

# Check is missing values have been eliminate
print(f"Missing values\n{df.isna().sum()}")

Missing values
weight          0
sender_state    0
parcel_type     0
dtype: int64


In [14]:
# Split into X and y

X = df.drop('parcel_type',axis=1)
y = df['parcel_type']

print(f"Shapes: {X.shape,y.shape}")

Shapes: ((1000, 2), (1000,))


In [15]:
# Conver categoric values to numeric encoding

X = pd.get_dummies(X)

print(f"Transformed X:\n{X.head()}")

Transformed X:
     weight  sender_state_AK  sender_state_AL  sender_state_AR  \
0  2.053350                0                0                0   
1  2.600236                0                0                0   
2  3.468107                0                0                0   
3  5.361340                0                0                0   
4  4.801337                0                0                0   

   sender_state_AZ  sender_state_CA  sender_state_CO  sender_state_CT  \
0                0                0                0                0   
1                0                0                0                0   
2                0                0                0                0   
3                0                0                0                0   
4                0                0                0                0   

   sender_state_DE  sender_state_FL  ...  sender_state_SD  sender_state_TN  \
0                0                0  ...                0              

In [16]:
# Train test split the data

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=25)

In [17]:
# Let us use the logisitc regression algorithm and fit a model
logreg = LogisticRegression()

logreg.fit(X_train,y_train)

# Predict on training data 
y_train_pred = logreg.predict(X_train)

# Training accuracy and confusion matrix
print(f"Training Accuracy: {accuracy_score(y_train_pred,y_train)*100}%")
print(f"\nConfusion Matrix:\n{confusion_matrix(y_train_pred,y_train)}")

Training Accuracy: 97.22222222222221%

Confusion Matrix:
[[204  10]
 [ 15 671]]


In [18]:
# Check the performance on test data

# Predict on training data 
y_test_pred = logreg.predict(X_test)

# Training accuracy and confusion matrix
print(f"Training Accuracy: {accuracy_score(y_test_pred,y_test)*100}%")
print(f"\nConfusion Matrix:\n{confusion_matrix(y_test_pred,y_test)}")

Training Accuracy: 98.0%

Confusion Matrix:
[[23  2]
 [ 0 75]]
