In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

%matplotlib inline

In [54]:
# Load the raw forest-fires table from a CSV file in the working directory.
df = pd.read_csv("forestfires.csv")
# Preview the first rows to check the available columns and value formats.
df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [55]:
# Check that the burned-area column is numeric before thresholding it.
print(df['area'].dtype) #float64

# Variant 1 (disabled): overwrite 'area' in place with the binary label.
#df['area'] = np.where(df['area']>0, 1, 0)

# Variant 2 (used): leave 'area' untouched and add a separate binary column,
# 1 where the recorded area is greater than zero and 0 otherwise.
df['wasFire'] = np.where(df['area']>0.0, 1, 0)


# Keep only the rows labelled as fires and count them per month.
fireInfo = df[df.wasFire == 1]
print(fireInfo.shape)
fireInfo.month.value_counts()
# Author's observation: there is no direct dependence on the day of the week,
# but there is one on the month.

float64
(270, 14)


aug    99
sep    97
mar    19
jul    18
feb    10
dec     9
jun     8
oct     5
apr     4
may     1
Name: month, dtype: int64

In [56]:
def load_datasets(filename, data_columns):
    """Read the forest-fires CSV and build a binary classification dataset.

    Parameters
    ----------
    filename : str or path-like
        CSV file to read. It must contain an ``area`` column, a ``month``
        column holding three-letter month abbreviations, and every column
        listed in ``data_columns``.
    data_columns : list of str
        Names of the columns returned as features.

    Returns
    -------
    X : pandas.DataFrame
        The requested feature columns, with ``month`` encoded as 1..12.
    y : pandas.Series
        The ``wasFire`` label: 1 where ``area > 0``, otherwise 0.

    Raises
    ------
    ValueError
        If ``month`` contains a non-missing value that is not a known
        three-letter month abbreviation.
    """
    month_numbers = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
                     'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}

    df = pd.read_csv(filename, dtype={'month': str})

    df['wasFire'] = np.where(df['area'] > 0.0, 1, 0)

    # Accept labels that differ only in case or surrounding whitespace
    # ("Aug", " aug") instead of turning them into NaN.
    month_text = df['month'].str.strip().str.lower()
    month_codes = month_text.map(month_numbers)

    # Series.map yields NaN for keys missing from the dict; fail loudly on
    # unrecognised labels rather than feeding NaN features to the optimiser.
    # Genuinely missing months (NaN in the file) are left as NaN, as before.
    unknown = month_text[month_codes.isna() & month_text.notna()]
    if not unknown.empty:
        raise ValueError(
            "Unrecognised month value(s): {}".format(sorted(unknown.unique()))
        )

    df['month'] = month_codes

    X, y = df[data_columns], df['wasFire']
    return X, y

In [57]:
def split_datasets(X, y, test_size = 0.2, random_state = None):
    """Split features and labels into random train and test subsets.

    Parameters
    ----------
    X, y : array-like
        Features and labels with the same number of rows.
    test_size : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Passed to ``train_test_split`` as ``random_state``. Pass an integer to
        get the same split on every run; the default ``None`` keeps the
        previous behaviour of a different random split on each call.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [58]:
def normalize(X):
    """Mean-centre every feature and scale it by its range (max - min).

    All statistics are computed per column (``axis=0``), so each feature is
    scaled independently whether ``X`` is a NumPy array or a pandas DataFrame.

    Parameters
    ----------
    X : numpy.ndarray or pandas.DataFrame
        Samples in rows, features in columns.

    Returns
    -------
    X_new : same container type as ``X``
        ``(X - mean) / range``.
    mean : per-feature mean.
    std : per-feature scale. Despite the name callers use for it, this is the
        value range (max - min), not the standard deviation. A feature with
        zero range is given a scale of 1 so it normalises to 0.
    """
    mean = np.mean(X, axis=0)
    value_range = np.max(X, axis=0) - np.min(X, axis=0)
    # A constant feature has zero range and would cause a division by zero.
    # Adding the boolean mask bumps exactly those entries from 0 to 1 and
    # works unchanged for scalars, arrays and pandas Series.
    value_range = value_range + (value_range == 0)
    X_new = (X - mean) / value_range
    return X_new, mean, value_range

In [59]:
def prepare_X(X):
    """Return ``X`` with a leading column of ones (the intercept term).

    The result is a NumPy array with one more column than ``X`` and the same
    number of rows.
    """
    bias_column = np.ones((X.shape[0], 1))
    return np.column_stack([bias_column, X])

In [60]:
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, applied element-wise."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [61]:
def h( X, theta ):
    """Hypothesis of logistic regression: ``sigmoid(X . theta)``.

    ``X`` is the design matrix and ``theta`` the parameter vector; the result
    holds one predicted probability per row of ``X``.
    """
    return sigmoid( np.dot( X, theta ) )

In [62]:
def cost_function(X, y, theta):
    """Mean binary cross-entropy of the logistic model on ``(X, y)``.

    Parameters
    ----------
    X : array-like
        Design matrix, one sample per row.
    y : array-like
        0/1 labels, shaped so that they broadcast element-wise against the
        output of ``h(X, theta)``.
    theta : array-like
        Model parameters.

    Returns
    -------
    float or None
        The mean cost, or ``None`` when ``X`` has no rows.
    """
    m = X.shape[0]
    if m == 0:
        return None
    # Saturated predictions (exactly 0.0 or 1.0 in floating point) would give
    # log(0) = -inf, and 0 * -inf = NaN even for a correctly classified
    # sample. Keeping probabilities inside the open interval (0, 1) keeps the
    # cost finite.
    eps = np.finfo(float).eps
    _h = np.clip( h( X, theta ), eps, 1 - eps )
    J =  ( -y * np.log( _h ) - ( 1 - y ) * np.log( 1 - _h )).mean()
    return J

In [63]:
def derivative_theta(X, y, theta):
    """Gradient of the logistic cost with respect to ``theta``.

    Computes ``X.T . (h(X, theta) - y) / m`` where ``m`` is the number of rows
    of ``X``. Returns ``None`` when ``X`` has no rows.
    """
    n_samples = X.shape[0]
    if n_samples == 0:
        return None

    residual = h( X, theta ) - y
    return np.dot( X.T, residual ) / n_samples

In [64]:
def gradient_descent(X, y, theta, alpha, epsilon, num_iters, print_J = True):
    """Minimise the logistic cost with batch gradient descent.

    Parameters
    ----------
    X, y : array-like
        Design matrix and labels, forwarded to ``cost_function`` and
        ``derivative_theta``.
    theta : array-like
        Initial parameters; the input is not modified.
    alpha : float
        Learning rate (step size).
    epsilon : float
        Stop as soon as the absolute change of the cost between two
        consecutive iterations is smaller than this value.
    num_iters : int
        Maximum number of iterations.
    print_J : bool, default True
        When truthy, print the initial cost and then the cost at every
        1000th iteration.

    Returns
    -------
    theta : the parameters after the last update.
    J_history : list
        The initial cost followed by the cost after each iteration.

    Raises
    ------
    ValueError
        If ``X`` has no rows (``cost_function`` returns ``None``).
    """
    J = cost_function(X, y, theta)
    if J is None:
        # Without this check the first update fails with an opaque TypeError
        # from multiplying the learning rate by None.
        raise ValueError("gradient_descent requires at least one training sample")

    if print_J:
        print(J)
    J_history = [J]

    for i in range(num_iters):
        theta = theta - alpha * derivative_theta(X, y, theta)

        J = cost_function(X, y, theta)
        J_history.append(J)

        if print_J and i % 1000 == 0:
            print("{}-th iteration: {}".format(i, J))

        # Converged: the cost barely moved compared with the previous step.
        if abs(J - J_history[-2]) < epsilon:
            break

    return theta, J_history

In [65]:
def predict(X, mean, std, theta):
    """Predict 0/1 labels for ``X`` with the trained parameters ``theta``.

    ``X`` is scaled with the supplied ``mean`` and ``std`` (the values that
    ``normalize`` returned for the training data), given an intercept column,
    and classified as 1 where the predicted probability is at least 0.5.
    """
    centre = np.array( mean )
    scale = np.array( std )
    features = prepare_X( ( X - centre ) / scale )
    probabilities = h( features, theta )
    return ( probabilities >= 0.5 ).astype(int)

In [66]:
# Feature columns passed to load_datasets; 'day' and 'area' are not included.
data_columns = ["X","Y","month","FFMC","DMC","DC","ISI","temp","RH","wind","rain"]
# target_column = "area"  -- this variable is no longer needed
X, y = load_datasets('forestfires.csv', data_columns)
# NOTE(review): the label says "Training set" but this is the full dataset,
# before it is split into train and test parts.
print('Training set: X={}, y={}'.format(X.shape, y.shape))
X

Training set: X=(517, 11), y=(517,)


Unnamed: 0,X,Y,month,FFMC,DMC,DC,ISI,temp,RH,wind,rain
0,7,5,3,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0
1,7,4,10,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0
2,7,4,10,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0
3,8,6,3,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2
4,8,6,3,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0
5,8,6,8,92.3,85.3,488.0,14.7,22.2,29,5.4,0.0
6,8,6,8,92.3,88.9,495.6,8.5,24.1,27,3.1,0.0
7,8,6,8,91.5,145.4,608.2,10.7,8.0,86,2.2,0.0
8,8,6,9,91.0,129.5,692.6,7.0,13.1,63,5.4,0.0
9,7,5,9,92.5,88.0,698.6,7.1,22.8,40,4.0,0.0


In [67]:
# Hold out 10% of the rows as the test set (third argument is test_size).
X_train, X_test, y_train, y_test = split_datasets(X, y, 0.1)
print('Training set: X={}, y={}'.format(X_train.shape, y_train.shape))
print('Test set: X={}, y={}'.format(X_test.shape, y_test.shape))

Training set: X=(465, 11), y=(465,)
Test set: X=(52, 11), y=(52,)


In [68]:
# Scale the training features; `mean` and `std` are kept so that predict()
# can apply the same transformation to the test data later.
X_train_norn,mean,std = normalize(X_train)
# Add the intercept column and turn the labels into an (m, 1) column vector
# so they line up element-wise with the model output.
X_new = prepare_X(X_train_norn)
y_new = y_train.values.reshape(X_new.shape[0], 1)
# These are the training matrices, so label them as such (was "Test set").
print('Training set: X={}, y={}'.format(X_new.shape, y_new.shape))

Test set: X=(465, 12), y=(465, 1)


In [69]:
# Start from all-zero parameters: one row per column of X_new (intercept included).
theta = np.zeros((X_new.shape[1], 1))

In [70]:
# Cost of the untrained (all-zero) model on the training data.
cost_function(X_new, y_new, theta)

0.6931471805599452

In [71]:
# Train with alpha=0.5, epsilon=1e-7 and at most 1,000,000 iterations,
# printing the cost as training progresses.
new_theta, Js = gradient_descent(X_new, y_new, theta, 0.5, 1e-7, 1000000, True)
# Show the fitted parameters and the number of recorded cost values.
print(new_theta, len(Js))

0.6931471805599452
0-th iteration: 0.6925126714369126
1000-th iteration: 0.6746104350535715
2000-th iteration: 0.6736808926009222
3000-th iteration: 0.673251583530871
4000-th iteration: 0.6730322633661892
[[ 0.10015475]
 [ 0.45317012]
 [ 0.48572798]
 [ 1.91613635]
 [ 1.39192127]
 [ 0.3204445 ]
 [-0.62207844]
 [-1.39284084]
 [ 0.27846914]
 [-0.09772033]
 [ 0.72753295]
 [ 0.47677805]] 4810


In [72]:
# Cost of the trained model on the training data.
cost_function(X_new, y_new, new_theta)

0.6729299624984083

In [73]:
# Predict on the held-out test split, scaling it with the training mean/std.
y_pred = predict(X_test,mean, std, new_theta)
# Fraction of test samples classified correctly by the hand-written model.
accuracy_score(y_test,y_pred)

0.5384615384615384

In [74]:
# Baseline for comparison: scikit-learn's LogisticRegression with default
# settings, fitted on the same training split (features passed unscaled).
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train)
predictions = logisticRegr.predict(X_test)
accuracy_score(y_test,predictions)

0.5192307692307693

In [75]:
# Confusion matrix of the hand-written model's predictions (y_pred), not the
# scikit-learn baseline; rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test,y_pred))

[[ 9 17]
 [ 7 19]]


In [76]:
# NOTE(review): this cell repeats the scikit-learn baseline from an earlier
# cell line for line; it refits the same model on the same split and is a
# candidate for removal.
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train)
predictions = logisticRegr.predict(X_test)
accuracy_score(y_test,predictions)

0.5192307692307693

In [77]:
# Recall of the hand-written model on the test split.
recall_score(y_test, y_pred, average='binary')  
# The best value is 1 and the worst value is 0.
# Recall is the ratio tp / (tp + fn), where tp is the number of true positives
# and fn the number of false negatives.

0.7307692307692307

In [78]:
# Precision of the hand-written model on the test split: tp / (tp + fp),
# where fp is the number of false positives.
precision_score(y_test, y_pred, average='binary')  

0.5277777777777778