# Introduction to Scikit-learn

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn

In [2]:
from sklearn.model_selection import train_test_split

# Read the raw table, taking the first CSV column as the row index,
# then preview the first few rows.
data_raw = pd.read_csv(
    "ROC_data.csv",
    index_col=0,
)
data_raw.head()

  mask |= (ar1 == a)


Unnamed: 0,LS_type,DA_area,prob_inh,prob_dyn,slope_tan,elevation,curvature,aspect,wet_index,litho,lulc
3970,8.0,3.0,0.0156,0.0,0.951556,2180.698,4.515543,80.28688,3.450854,5.0,71.0
3971,8.0,3.0,0.0124,0.0,1.093501,2166.145,3.625163,52.44894,3.311813,5.0,71.0
3972,8.0,3.0,0.0222,0.0,0.69077,2140.36,-1.464627,40.48208,4.597811,5.0,71.0
3973,8.0,3.0,0.0304,0.596,0.407125,2127.395,-1.632731,29.35348,5.113264,5.0,31.0
3975,8.0,3.0,0.0222,0.0,0.349647,2114.717,0.455268,51.53106,4.885499,5.0,52.0


In [3]:
# Lookup tables translating the numeric category codes into readable labels.
LS_type_mapping = {
    8: "No landslides",
    1: "Fall/Topple",
    2: "Torrent",
    3: "Avalanche",
    4: "Slump/Creep",
    5: "Sackung",
}
DA_area_mapping = {
    1: "runout",
    2: "source",
    3: "other",
}
lithology_mapping = {
    1: 'Unconsolidated Sediment',
    2: 'Ultrabasic rock',
    3: 'Weak Metamorphic Foliated',
    4: 'Sedimentary Rock',
    5: 'Hard Metamorphic',
    6: 'Intrusive Igneous',
    7: 'Volcanic/Extrusive Igneous',
}
LULC_mapping = {
    71: "Herbaceous",
    52: "Shrubland",
    41: "Forest",
    31: "Barren",
    21: "Developed",
}

# Work on a copy so data_raw keeps the original numeric codes.
data = data_raw.copy()

# The raw table stores the codes as floats (e.g. 8.0); the integer keys above
# still match them. Series.map leaves NaN for any code missing from its
# dictionary.
data['LS_type'] = data_raw['LS_type'].map(LS_type_mapping)
data['DA_area'] = data_raw['DA_area'].map(DA_area_mapping)
data['litho'] = data_raw['litho'].map(lithology_mapping)
data['lulc'] = data_raw['lulc'].map(LULC_mapping)

In [4]:
# Preview the first rows to confirm the numeric codes were replaced by labels.
data.head()

Unnamed: 0,LS_type,DA_area,prob_inh,prob_dyn,slope_tan,elevation,curvature,aspect,wet_index,litho,lulc
3970,No landslides,other,0.0156,0.0,0.951556,2180.698,4.515543,80.28688,3.450854,Hard Metamorphic,Herbaceous
3971,No landslides,other,0.0124,0.0,1.093501,2166.145,3.625163,52.44894,3.311813,Hard Metamorphic,Herbaceous
3972,No landslides,other,0.0222,0.0,0.69077,2140.36,-1.464627,40.48208,4.597811,Hard Metamorphic,Herbaceous
3973,No landslides,other,0.0304,0.596,0.407125,2127.395,-1.632731,29.35348,5.113264,Hard Metamorphic,Barren
3975,No landslides,other,0.0222,0.0,0.349647,2114.717,0.455268,51.53106,4.885499,Hard Metamorphic,Shrubland


In [None]:
# (number of rows, number of columns) of the label-mapped table.
data.shape

In [6]:
# Number of rows per landslide type, most frequent first.
data['LS_type'].value_counts()

No landslides    2333568
Avalanche          44094
Fall/Topple        30908
Torrent             1721
Slump/Creep          248
Sackung               13
Name: LS_type, dtype: int64

In [9]:
# Boolean mask over the columns: True where the column's dtype compares equal
# to float (the numeric measurements), False for the label-mapped columns.
continuous = data.dtypes == float
continuous

LS_type      False
DA_area      False
prob_inh      True
prob_dyn      True
slope_tan     True
elevation     True
curvature     True
aspect        True
wet_index     True
litho        False
lulc         False
dtype: bool

In [None]:
import seaborn as sns

# A pair plot draws one panel for every pair of numeric columns. Doing that for
# every row of this table (millions of rows, per the value counts above) is
# impractically slow and hopelessly over-plotted, so plot a reproducible random
# subsample instead.
# NOTE(review): very rare classes (e.g. "Sackung") may not appear in a sample
# of this size -- raise the cap if they need to be visible.
pairplot_max_rows = 5000
pairplot_sample = data.sample(
    n=min(len(data), pairplot_max_rows),  # never ask for more rows than exist
    random_state=0,
)
sns.pairplot(pairplot_sample, hue='LS_type')

In [None]:
# Target: the landslide type label.
y = data['LS_type']

# Predictors: every other column. DA_area, litho and lulc were mapped to text
# labels above, and scikit-learn estimators require numeric input, so one-hot
# encode them; the float columns pass through get_dummies unchanged.
# NOTE(review): confirm that DA_area / prob_inh / prob_dyn are legitimate
# predictors and do not leak the target.
X = pd.get_dummies(data.drop('LS_type', axis=1))

# Hold out a test set (scikit-learn's default split); the fixed seed makes
# the split reproducible.
# NOTE(review): the classes are heavily imbalanced (see the value counts
# above); consider stratify=y so rare classes land in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0)

In [None]:
# (rows, feature columns) in the training split.
X_train.shape

In [None]:
# Absolute number of training rows per landslide type.
y_train.value_counts()

In [None]:
# Share of the training rows that falls into each landslide type
# (y_train is already a pandas Series, so it can be counted directly).
y_train.value_counts() / len(y_train)

Really Simple API
-------------------
0) Import your model class

In [None]:
from sklearn.linear_model import LogisticRegression

1) Instantiate an object and set the parameters

In [None]:
# Create the estimator with scikit-learn's default hyper-parameters.
# NOTE(review): the numeric features are on very different scales (elevation
# in the thousands, probabilities below 1), so the solver may stop at its
# default iteration cap -- consider scaling the inputs or raising max_iter.
logreg = LogisticRegression()

2) Fit the model

In [None]:
# Train the model on the training split only.
logreg.fit(X_train, y_train)

3) Apply / evaluate

In [None]:
# Predicted landslide type for the first ten training rows.
print(logreg.predict(X_train.iloc[:10]))

In [None]:
# Mean accuracy on the data the model was trained on. With roughly 97% of all
# rows labelled "No landslides" (see the value counts above), always predicting
# that class already scores about 0.97, so read this number against that baseline.
logreg.score(X_train, y_train)

In [None]:
# Mean accuracy on the held-out test split (data not used for fitting).
logreg.score(X_test, y_test)

And again, this time with a random forest
---------

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# A random forest is a stochastic model: fix the seed so the fitted forest and
# its scores are reproducible (same seed as the train/test split). n_jobs=-1
# builds the 1000 trees in parallel on all available cores.
rf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)

In [None]:
# Train the forest on the training split -- same fit() call as for logreg.
rf.fit(X_train, y_train)

In [None]:
# Mean accuracy on the training data.
rf.score(X_train, y_train)

In [None]:
# Mean accuracy on the held-out test split; compare with the training score
# above to gauge over-fitting.
rf.score(X_test, y_test)

# Uploading data to OpenML

In [None]:
import openml