# 2. Gyakorlat: Logisztikus regresszió

In [2]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

## Adatok beolvasása

In [3]:
df = pd.read_csv('student_records.csv')
df.head()

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Henry,A,Y,90,85,Yes
1,John,C,N,85,51,Yes
2,David,F,N,10,17,No
3,Holmes,B,Y,75,71,No
4,Marvin,E,N,20,30,No


## Adatok előkészítése

In [4]:
training_features = df[['OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore']].copy()

outcome_labels = df[['Recommend']].copy()

# Numerikus és kategorikus változók szétválogatása
numeric_feature_names = ['ResearchScore', 'ProjectScore']
categoricial_feature_names = ['OverallGrade', 'Obedient']

print("Tanító adatok:")
print(training_features.head())
print()
print("Címkék:")
print(outcome_labels.head())

Tanító adatok:
  OverallGrade Obedient  ResearchScore  ProjectScore
0            A        Y             90            85
1            C        N             85            51
2            F        N             10            17
3            B        Y             75            71
4            E        N             20            30

Címkék:
  Recommend
0       Yes
1       Yes
2        No
3        No
4        No


## Numerikus jellemzők szerkesztése

In [5]:
ss = StandardScaler() # StandardScaler objektum

# Illesszük az adatokra 
ss.fit(training_features[numeric_feature_names])

# Transzformáljuk az adatokat a scaler szerint
training_features[numeric_feature_names] = ss.transform(training_features[numeric_feature_names])

training_features.head()

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,0.899583,1.37665
1,C,N,0.730648,-0.091777
2,F,N,-1.80339,-1.560203
3,B,Y,0.392776,0.772004
4,E,N,-1.465519,-0.998746


## Kategorikus változók szerkesztése

In [6]:
training_features = pd.get_dummies(training_features, columns=categoricial_feature_names)

categorical_engineered_features = list(set(training_features.columns) - set(numeric_feature_names))

print(categorical_engineered_features)

['Obedient_Y', 'OverallGrade_C', 'OverallGrade_F', 'Obedient_N', 'OverallGrade_A', 'OverallGrade_E', 'OverallGrade_B']


## Modellezés

In [7]:
# Regresszor létrehozása
lr = LogisticRegression()

# Függvény illesztése 
model = lr.fit(training_features, np.array(outcome_labels['Recommend']))

## Modell értékelés

In [8]:
# egyszerű értékelés a training adatokon
pred_labels = model.predict(training_features)
actual_labels = np.array(outcome_labels['Recommend'])

print('Accuracy:', float(accuracy_score(actual_labels, pred_labels))*100, '%')
print('Classification Stats:')
print(classification_report(actual_labels, pred_labels))

Accuracy: 100.0 %
Classification Stats:
              precision    recall  f1-score   support

          No       1.00      1.00      1.00         5
         Yes       1.00      1.00      1.00         3

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



## Predikció élesben

In [9]:
# Új adatok beolvasása
new_data = pd.read_csv('new_data.csv', sep=';')
prediction_features = new_data[new_data.columns]

new_data.head()

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore
0,Nathan,F,N,30,20
1,Thomas,A,Y,78,80


## Új adatok átalakítása

In [10]:
# méretezés
prediction_features[numeric_feature_names] = ss.transform(prediction_features[numeric_feature_names])

# kategória változók
prediction_features = pd.get_dummies(prediction_features, columns=categoricial_feature_names)

# hiányzó kategória oszlopok hozzáadása
current_categorical_engineered_features = set(prediction_features.columns) - set(numeric_feature_names)
missing_features = set(categorical_engineered_features) - current_categorical_engineered_features

# nullák hozzáadása, mert az adathalmazban nem fordult elő minden jegyből
for feature in missing_features:    
    prediction_features[feature] = [0] * len(prediction_features)  

prediction_features.drop('Name', axis=1, inplace=True)

prediction_features.head()

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y,OverallGrade_B,OverallGrade_C,OverallGrade_E
0,-1.127647,-1.430636,False,True,True,False,0,0,0
1,0.494137,1.160705,True,False,False,True,0,0,0


## Predikció új adatokon a modellel

In [11]:
prediction_features = prediction_features[training_features.columns] # Oszlopok sorrendjének átrendezése

predictions = model.predict(prediction_features)

new_data['Recommend'] = predictions # Predikciók hozzárendelése az adathalmazhoz

new_data.head()

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Nathan,F,N,30,20,No
1,Thomas,A,Y,78,80,Yes
