In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split
import seaborn as sns

# import multiple cities worth of data
data_1 = pd.read_csv('data/Austin_Final_2022-06-18.csv')
data_2 = pd.read_csv('data/Albuquerque_Final_2022-06-18.csv')
data_3 = pd.read_csv('data/StLouis_Final_2022-06-18.csv')
data_4 = pd.read_csv('data/WashingtonDC_Final_2022-06-18.csv')

# list of dataframes
data_list = [data_3]


In [6]:
# loop through the dataframes in data_list
for i in range(1, len(data_list) + 1):
    # get the dataframe
    data = globals()['data_' + str(i)]
    
    # one hot tree types
    tree_types  = pd.get_dummies(data['common_name'])

    # drop the common_name column
    data = data.drop(columns=['common_name'])

    # one hot condition column
    data['condition'] = data['condition'].replace({'excellent': 0, 'good': 1, 'fair': 2, 'poor': 3, 'dead/dying': 4, 'dead': 4})
    
    # create new dataframe with latitude and longitude, tree_types, native, and condition
    all_data = pd.concat([data[['longitude_coordinate', 'latitude_coordinate', 
                                'condition']],
                            tree_types],
                            axis=1)

    # drop rows the NaN values for condition
    all_data = all_data.dropna()

    # save the dataframe
    globals()['all_data_' + str(i)] = all_data

# concatenate all the dataframes
all_data = pd.concat([all_data_1, all_data_2, all_data_3, all_data_4], axis=0)


# # one hot tree types
# tree_types  = pd.get_dummies(data['common_name'])

# # convert condition to numerical
# condition = data['condition'].replace({'excellent': 0, 'good': 1, 'fair': 2, 'poor': 3, 'dead/dying': 4, 'dead': 4})

# # one hot native column
# native = pd.get_dummies(data['native'])

# # create new dataframe with latitude and longitude, tree_types, native, and condition
# all_data = pd.concat([data[['longitude_coordinate', 'latitude_coordinate']], 
#                       tree_types, 
#                       native, 
#                       condition], 
#                       axis=1)


all_data.head()



Unnamed: 0,longitude_coordinate,latitude_coordinate,condition,Afghan pine,American elm,American hornbeam,American persimmon,American sweet gum,American sycamore,Anacacho orchid tree,...,Summershade norway maple,Sunburst honeylocust,Sweetgum (sterile),Tulip poplar,Tupelo,Turkey oak,Village green japanese zelkova,White pine,Whitehouse callery pear,Yellow buckeye
0,-97.736905,30.273151,3.0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,-97.736577,30.27391,2.0,0.0,1,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,-97.736597,30.27336,3.0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,-97.736319,30.273816,3.0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,-97.736167,30.273598,2.0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [7]:
# split data back into x and y
x_data = all_data.drop(['condition'], axis=1)
y_data = all_data['condition']

# print unique values in y_data
print(y_data.unique())

[3.0 2.0 1.0 0.0 4.0 ' ']


In [38]:
# # standardize x_data
# scaler = StandardScaler()
# x_data = scaler.fit_transform(x_data)

In [8]:
# split into train and test
train_data, test_data = train_test_split(all_data, test_size=0.2, random_state=25)

In [9]:
# split into x and y
x_train = train_data.drop(['condition'], axis=1)
y_train = train_data['condition']
x_test = test_data.drop(['condition'], axis=1)
y_test = test_data['condition']

x_train.head()

Unnamed: 0,longitude_coordinate,latitude_coordinate,Afghan pine,American elm,American hornbeam,American persimmon,American sweet gum,American sycamore,Anacacho orchid tree,Arizona ash,...,Summershade norway maple,Sunburst honeylocust,Sweetgum (sterile),Tulip poplar,Tupelo,Turkey oak,Village green japanese zelkova,White pine,Whitehouse callery pear,Yellow buckeye
684,-77.014634,38.876586,,0,,,,0.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42627,-90.256663,38.617732,,0,0.0,,,0.0,,,...,,,,,,,,,,
82749,-76.984187,38.901973,,0,,,,0.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136962,-76.99047,38.937954,,0,,,,0.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
142555,-76.979077,38.926463,,0,,,,0.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# x_train = train_data.iloc
# y_train = train_data.iloc[:,-5:]
# x_test = test_data.iloc[:, 0:-5]
# y_test = test_data.iloc[:,-5:]

In [10]:
print(y_test)

23294     3.0
72365       1
166164      1
43009       1
6223        1
         ... 
6458      3.0
34875       1
14849     2.0
21581       2
144935      1
Name: condition, Length: 53561, dtype: object


In [11]:
# compute correlation matrix
corr_df = x_train
cor = corr_df.corr(method='pearson') 
# print(cor)

# Plot
fig_corr, ax_corr =plt.subplots()
plt.title("Correlation Plot")
sns.heatmap(cor, cmap=plt.cm.RdYlBu, square=True, ax=ax_corr)
plt.show()


In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# train logistic regression model on training data
logistic_regression = LogisticRegression()
logistic_regression.fit(x_train, y_train)

# predict on test data
y_pred = logistic_regression.predict(x_test)

# compute accuracy
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)



0.5525


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [58]:
# optimize logistic regression model
from sklearn.model_selection import GridSearchCV

# initialize logistic regression model parameters
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 100)   

# create empty accuracy list
accuracy = []

# loop through parameters
for p in penalty:
    for c in C:
        # initialize logistic regression model
        logistic_regression = LogisticRegression(penalty=p, C=c, solver='liblinear')
        
        # fit model
        logistic_regression.fit(x_train, y_train)
        
        # predict on test data
        y_pred = logistic_regression.predict(x_test)
        
        # compute accuracy
        accuracy.append(accuracy_score(y_test, y_pred))

# find max accuracy
max_accuracy = max(accuracy)
print(max_accuracy)

# find index of max accuracy
max_accuracy_index = accuracy.index(max_accuracy)
print(max_accuracy_index)

# find parameters that correspond to max accuracy
penalty = penalty[max_accuracy_index // len(C)]
C = C[max_accuracy_index % len(C)]

# plot accuracy
plt.plot(accuracy)
plt.show()








KeyboardInterrupt: 

In [46]:
# return optimal parameters
print(penalty)
print(C)

# initialize logistic regression model with optimal parameters
logistic_regression = LogisticRegression(penalty=penalty, C=C, solver='liblinear')

# fit model
logistic_regression.fit(x_train, y_train)

# predict on test data
y_pred = logistic_regression.predict(x_test)

# compute accuracy
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

l1
5214.00828799969
0.5203389830508475
