## This notebook trains a logistic regression classifier on the full training data set, generates predictions for the unlabeled test set, and writes those predictions to a CSV file that Kaggle can evaluate.

In [1]:
import sklearn as sk
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression

# Seed NumPy's global random number generator so repeated runs of the
# notebook produce the same results.
SEED = 0
np.random.seed(SEED)

In [2]:
# Load the labelled training set (path is relative to this notebook).
data = pd.read_csv("../../data/train.csv")

# Features: every column except the target ("Cover_Type") and the row "Id".
train_feature_mask = ~data.columns.isin(["Cover_Type", "Id"])
train_df = data.loc[:, train_feature_mask]

# Target: the "Cover_Type" column on its own.
train_labels_df = data["Cover_Type"]

# Preview the first rows of the feature frame.
train_df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,0
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,0
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,0
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,0
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Load the unlabeled test set (path is relative to this notebook).
test_data = pd.read_csv("../../data/test.csv")

# Features: every column except the row "Id"; the Ids stay available in
# `test_data` for later use.
test_feature_mask = test_data.columns != "Id"
test_df = test_data.loc[:, test_feature_mask]

# Preview the first rows of the feature frame.
test_df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,2680,354,14,0,0,2684,196,214,156,6645,...,0,0,0,0,0,0,0,0,0,0
1,2683,0,13,0,0,2654,201,216,152,6675,...,0,0,0,0,0,0,0,0,0,0
2,2713,16,15,0,0,2980,206,208,137,6344,...,0,0,0,0,0,0,0,0,0,0
3,2709,24,17,0,0,2950,208,201,125,6374,...,0,0,0,0,0,0,0,0,0,0
4,2706,29,19,0,0,2920,210,195,115,6404,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Build an L1-penalised logistic regression classifier. In scikit-learn, C is
# the inverse of the regularisation strength, and the "liblinear" solver is one
# of the solvers that supports the L1 penalty.
# (A stale, unused `hidden_layer_sizes` setting and a duplicated
# `classifier = classifier = ...` assignment were removed from this cell.)
classifier = LogisticRegression(C=10, penalty="l1", solver="liblinear", multi_class="auto")

# Fit the classifier on the training features and their Cover_Type labels.
classifier.fit(train_df, train_labels_df)

# Predict a label for every row of the test features.
predictions = classifier.predict(test_df)



In [7]:
# Wrap the predicted labels in a single-column DataFrame ("Cover_Type"),
# indexed by the "Id" values of the corresponding test rows.
test_ids = test_data["Id"]
predictions_df = pd.DataFrame(predictions, index=test_ids, columns=["Cover_Type"])

# Display the resulting frame.
predictions_df

Unnamed: 0_level_0,Cover_Type
Id,Unnamed: 1_level_1
15121,2
15122,2
15123,2
15124,2
15125,2
...,...
581008,3
581009,3
581010,3
581011,3


In [8]:
# Write the predictions to a CSV file in the current working directory.
# The index is written as well, so the file has an index column followed by
# the "Cover_Type" column.
predictions_df.to_csv(path_or_buf="lr_predictions.csv", index=True)