In [15]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily build the full path of every file found anywhere below the Kaggle
# input directory, then print one path per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [85]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, auc, roc_auc_score

In [17]:
# Read the raw CSV once; every later cell works from this `data` frame.
data_csv_path = "../input/school-data/data.csv"
data = pd.read_csv(data_csv_path)

# Report (rows, columns) and preview the first rows.
print(data.shape)
data.head()

### Data Analysis and Feature Engineering

In [18]:
# Count the missing values in each column and show the counts as a
# one-column frame.
data.isna().sum().to_frame()

In [41]:
# Numeric feature columns (used for the IQR outlier bounds).
numerical_variables = [
    "parent_age",
    "parent_salary",
    "house_area",
    "average_grades",
]

# Categorical feature columns (inspected with value_counts and label-encoded
# before modelling).
categorical_variables = [
    'type_school',
    'school_accreditation',
    'gender',
    'interest',
    'residence',
    'parent_was_in_college',
]

In [19]:
# Check whether the target classes are imbalanced by counting the rows
# for each value of `in_college`.
data["in_college"].value_counts()

### Checking for Outliers

In [20]:
# Box plot of parent_age to eyeball outliers.
data.boxplot(column="parent_age", figsize=(5, 5))

In [21]:
# Box plot of parent_salary to eyeball outliers.
data.boxplot(column="parent_salary", figsize=(5, 5))

In [22]:
# Box plot of house_area to eyeball outliers.
data.boxplot(column="house_area", figsize=(5, 5))

In [23]:
# Box plot of average_grades to eyeball outliers.
data.boxplot(column="average_grades", figsize=(5, 5))

### Outliers are present, so writing a function to find their lower and upper bounds using the IQR (removing them is still to be done)

In [38]:
def iqr_outliers(col, df=None, k=1.5):
    """Return the IQR-based (lower, upper) outlier bounds for one column.

    The bounds are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where ``Q1`` and
    ``Q3`` are the 25th and 75th percentiles of the column and
    ``IQR = Q3 - Q1``. The function only computes the bounds; it does not
    filter or modify any rows.

    Parameters
    ----------
    col : str
        Name of the column to analyse.
    df : pandas.DataFrame, optional
        Frame that holds ``col``. When omitted, the notebook-level ``data``
        frame is used, so existing ``iqr_outliers(col)`` calls keep working.
    k : float, optional
        Multiplier applied to the IQR. Defaults to 1.5.

    Returns
    -------
    tuple
        ``(lower_band, upper_band)``.
    """
    # Fall back to the global frame only when no frame is passed explicitly.
    frame = data if df is None else df

    q1 = frame[col].quantile(0.25)
    q3 = frame[col].quantile(0.75)
    iqr = q3 - q1

    lower_band = q1 - k * iqr
    upper_band = q3 + k * iqr

    return lower_band, upper_band


In [82]:

%%time
# Print the IQR-based (lower, upper) bounds of every numerical variable.

banner = "Lower_band & Upper_band values of Numerical Variables \n __________________________________________________________ \n"
print(banner)

for variable_name in numerical_variables:
    bounds = iqr_outliers(variable_name)
    print(variable_name, ":  ", bounds)
    print(" ")

In [56]:
# Summary statistics of the data set, displayed as the cell output.
summary_statistics = data.describe()
summary_statistics

In [26]:
# List the categories of each categorical variable together with how often
# each one occurs.

divider = "================================"
for variable_name in categorical_variables:
    category_counts = data[variable_name].value_counts()
    print(variable_name, ": ", "\n \n", category_counts, "\n", divider)

In [27]:
# Check whether the target classes are imbalanced by counting the rows
# for each value of `in_college`.

data["in_college"].value_counts()

In [92]:
# Features are every column except the last; the target is the last column
# (presumably `in_college` - verify against the CSV's column order).
# `.copy()` gives X its own data: the label-encoding cell assigns into X's
# columns, and writing into a bare slice of `data` raises
# SettingWithCopyWarning and leaves it unclear whether `data` is altered.
X = data.iloc[:, :-1].copy()
y = data.iloc[:, -1].copy()

In [93]:
# Preview the first five rows of the feature frame.
X.head(n=5)

In [94]:
%%time
# Replace each categorical column of X with integer codes.
# One LabelEncoder instance is re-fitted per column, so after the loop it
# only holds the mapping of the last column processed.

encoder = LabelEncoder()

for feature_name in categorical_variables:
    encoded_values = encoder.fit_transform(X[feature_name])
    X[feature_name] = encoded_values

X.head()

In [95]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

### Building machine learning model

In [102]:
# Candidate classifiers, each trained and scored on the same train/test split.
log_reg = LogisticRegression()
svm = SVC()

models = {"Logistic Regression":log_reg, "Support Vector Machine":svm}


for model_name,model in models.items():

    print(model_name, "\n ==================== \n")

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Continuous decision scores are used for ROC AUC so the metric reflects
    # how well the model ranks samples, not just its hard predictions.
    # Assumes a binary target - TODO confirm.
    y_score = model.decision_function(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # roc_auc_score replaces sklearn.metrics.auc here: `auc` integrates an
    # arbitrary (x, y) curve with the trapezoidal rule, so calling it on
    # labels and predictions does not yield a ROC AUC.
    auc_score = roc_auc_score(y_test, y_score)

    print(
        " Accuracy of {} is {}".format(model_name, accuracy), "\n", "ROC AUC of {} is {}".format(model_name, auc_score), "\n "
    )

## The Results are:

 **(1) Logistic Regression:**

  - Accuracy of Logistic Regression is 0.475 
  - `sklearn.metrics.auc(y_test, y_pred)` value for Logistic Regression is 59.5 (not a valid ROC AUC, which cannot exceed 1)
 
 
 **(2) Support Vector Machine**
  

  - Accuracy of Support Vector Machine is 0.65 
  - `sklearn.metrics.auc(y_test, y_pred)` value for Support Vector Machine is 50.0 (not a valid ROC AUC, which cannot exceed 1)
 
 
 #### **Notes:**
 
  - The outliers have not been removed here (they need to be removed and the models trained again)
  - Label Encoder was used to handle the categorical data (better encoding techniques could be used)
  - Only simple classifiers were used: Logistic Regression and Support Vector Machine