In [3]:
# Import required libraries and dependencies
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Load the data into a Pandas DataFrame
file_path = (r"C:\Users\jobel\OneDrive\Documents\UM\Project_4_Group_2\Project4Group2\pet_adoption_data.csv")
df_pet_adoption = pd.read_csv(file_path)

# Display sample data
df_pet_adoption.head(10)

Unnamed: 0,PetID,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood
0,500,Bird,Parakeet,131,Orange,Large,5.039768,1,0,27,140,0,0
1,501,Rabbit,Rabbit,73,White,Large,16.086727,0,0,8,235,0,0
2,502,Dog,Golden Retriever,136,Orange,Medium,2.076286,0,0,85,385,0,0
3,503,Bird,Parakeet,97,White,Small,3.339423,0,0,61,217,1,0
4,504,Rabbit,Rabbit,123,Gray,Large,20.4981,0,0,28,14,1,0
5,505,Dog,Labrador,70,Brown,Large,20.986261,0,0,87,301,1,0
6,506,Bird,Parakeet,169,Brown,Small,10.902613,1,0,70,440,1,0
7,507,Cat,Siamese,13,Orange,Large,7.252683,1,0,3,137,0,1
8,508,Bird,Parakeet,49,Brown,Medium,24.597598,1,1,69,405,0,0
9,509,Bird,Parakeet,60,Gray,Large,7.295994,0,0,73,231,1,0


In [5]:
df_pet_adoption.columns.values.tolist()

['PetID',
 'PetType',
 'Breed',
 'AgeMonths',
 'Color',
 'Size',
 'WeightKg',
 'Vaccinated',
 'HealthCondition',
 'TimeInShelterDays',
 'AdoptionFee',
 'PreviousOwner',
 'AdoptionLikelihood']

In [6]:
# Distribution plots
numerical_features = ['Age_in_Months', 'Weight_in_Kg', 'Days_in_Shelter', 'Adoption_Fee']
for feature in numerical_features:
    plt.figure(figsize=(10, 6))
    plt.hist(df_pet_adoption[feature], bins=30, edgecolor='k', alpha=0.7)
    plt.title(f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.show()

KeyError: 'Age_in_Months'

<Figure size 1000x600 with 0 Axes>

In [None]:
# Distribution plots
categorical_features = ['Pet_Type', 'Breed', 'Color', 'Size', 'Vaccinated', 'Health_Condition', 'Previous_Owner']
for feature in categorical_features:
    plt.figure(figsize=(10, 6))
    df_pet_adoption[feature].value_counts().plot(kind='bar', alpha=0.7)
    plt.title(f'Count Plot of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.xticks(rotation=90)
    plt.show()

In [7]:
# imports
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [8]:
# Review the info
df_pet_adoption.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2007 entries, 0 to 2006
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   PetID               2007 non-null   int64  
 1   PetType             2007 non-null   object 
 2   Breed               2007 non-null   object 
 3   AgeMonths           2007 non-null   int64  
 4   Color               2007 non-null   object 
 5   Size                2007 non-null   object 
 6   WeightKg            2007 non-null   float64
 7   Vaccinated          2007 non-null   int64  
 8   HealthCondition     2007 non-null   int64  
 9   TimeInShelterDays   2007 non-null   int64  
 10  AdoptionFee         2007 non-null   int64  
 11  PreviousOwner       2007 non-null   int64  
 12  AdoptionLikelihood  2007 non-null   int64  
dtypes: float64(1), int64(8), object(4)
memory usage: 204.0+ KB


In [10]:
df_pet_adoption_renamed = df_pet_adoption.rename(columns={"PetID":"Pet_ID", "PetType":"Pet_Type","AgeMonths": "Age_in_Months", "WeightKg":"Weight_in_Kg", "HealthCondition":"Health_Condition",
                                                          "TimeInShelterDays":"Days_in_Shelter", "AdoptionFee":"Adoption_Fee", "PreviousOwner":"Previous_Owner",
                                                          "AdoptionLikelihood":"Adoption_Likelihood"})

In [12]:
# Convert categorical data to numeric with `pd.get_dummies`
df_pet_adoption_dummies = pd.get_dummies(df_pet_adoption_renamed, columns=['Pet_Type', 'Breed', 'Color', 'Size'])
# Display the transformed data
df_pet_adoption_dummies

Unnamed: 0,Pet_ID,Age_in_Months,Weight_in_Kg,Vaccinated,Health_Condition,Days_in_Shelter,Adoption_Fee,Previous_Owner,Adoption_Likelihood,Pet_Type_Bird,...,Breed_Rabbit,Breed_Siamese,Color_Black,Color_Brown,Color_Gray,Color_Orange,Color_White,Size_Large,Size_Medium,Size_Small
0,500,131,5.039768,1,0,27,140,0,0,True,...,False,False,False,False,False,True,False,True,False,False
1,501,73,16.086727,0,0,8,235,0,0,False,...,True,False,False,False,False,False,True,True,False,False
2,502,136,2.076286,0,0,85,385,0,0,False,...,False,False,False,False,False,True,False,False,True,False
3,503,97,3.339423,0,0,61,217,1,0,True,...,False,False,False,False,False,False,True,False,False,True
4,504,123,20.498100,0,0,28,14,1,0,False,...,True,False,False,False,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2002,2502,72,27.039045,1,0,66,26,1,1,False,...,False,False,False,False,False,True,False,False,False,True
2003,2503,124,4.726954,1,1,59,150,0,0,False,...,True,False,False,True,False,False,False,False,False,True
2004,2504,113,1.758592,1,0,68,302,0,0,False,...,True,False,False,False,False,True,False,False,False,True
2005,2505,12,20.961592,1,0,59,478,0,0,False,...,False,False,False,False,True,False,False,True,False,False


In [None]:
# Define features set
X = df_pet_adoption_dummies.copy()
X.drop("Adoption_Likelihood", axis=1, inplace=True)
X.head()

In [None]:
# Define target vector
y = df_pet_adoption_dummies["Adoption_Likelihood"].values.reshape(-1, 1)
y[:5]

In [None]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [None]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [None]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
model = RandomForestClassifier()
model.fit(X_train_scaled, y_train)

importances = model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# List the top 10 most important features
feature_importance_df.head(10)

In [15]:
#features = ['Pet_ID', 'Pet_Type', 'Breed', 'Age_in_Months', 'Color', 'Size', 'Weight_in_Kg', 'Vaccinated', 'Health_Condition', 'Days_in_Shelter', 'Adoption_Fee', 'Previous_Owner', 'Adoption_Likelihood']
X = df_pet_adoption_dummies
y = df_pet_adoption_dummies['Adoption_Likelihood']
train_X, val_X, train_y, val_y = train_test_split(X,y, random_state=1)

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
pet_model = LogisticRegression(max_iter=1000)
pet_model.fit(train_X,train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
predict = pet_model.predict(val_X)

In [21]:
accuracy = accuracy_score(val_y, predict)
print("Accuracy of Logistic Regression: " , round(accuracy*100,2), "%")

Accuracy of Logistic Regression:  100.0 %
