# Scikit-learn Introduction

## Get the data

In [None]:
import pandas as pd

In [None]:
# Load the UCLA Admissions dataset and discard the unwanted "Serial No." column
# in a single chained expression.
ucla_admit_df = pd.read_csv("ucla_admit.csv").drop(columns="Serial No.")


In [None]:
# Preview the first rows of the freshly loaded table.
ucla_admit_df.head()

### Check Correlated Features

In [None]:
# Pairwise correlations between the numeric columns only. "Admitted" still
# holds the strings "No"/"Yes" at this point (it is encoded in a later cell),
# and recent pandas versions raise on non-numeric columns instead of silently
# skipping them.
ucla_admit_df.select_dtypes(include="number").corr()

In [None]:
import warnings

import seaborn as sns

# Hide Jupyter warnings so the cell output stays uncluttered.
warnings.filterwarnings(action="ignore")

# Pairwise plot grid of the selected columns, colored by admission outcome.
pairplot_columns = ["Admitted", "GRE Score", "GPA"]
pair_plot = sns.pairplot(ucla_admit_df[pairplot_columns], hue="Admitted")

### Feature Engineering and Transformation

In [None]:
# Most algorithms need numeric inputs, so string labels must be encoded first
# (Series.map() with a lookup dict is one way to do it).
# Encode "Admitted" as a binary feature: "No" -> 0, "Yes" -> 1.
admitted_codes = {"No": 0, "Yes": 1}
ucla_admit_df["Admitted"] = ucla_admit_df["Admitted"].map(admitted_codes)

In [None]:
# Engineer "Performance Score": GRE Score plus GPA scaled by 100.
gre_part = ucla_admit_df["GRE Score"]
gpa_part = ucla_admit_df["GPA"] * 100
ucla_admit_df["Performance Score"] = gre_part + gpa_part

ucla_admit_df.head()

In [None]:
# You could potentially drop "GRE Score" and "GPA", as they are already factored into "Performance Score"

#ucla_admit_df = ucla_admit_df.drop(["GRE Score", "GPA"], axis=1)
#ucla_admit_df.head()

In [None]:
# Derive "Adj. Performance Score" by subtracting ten points per
# "University Rating" level from "Performance Score", to account for the
# level of university being applied to.
rating_adjustment = ucla_admit_df["University Rating"] * 10
ucla_admit_df["Adj. Performance Score"] = (
    ucla_admit_df["Performance Score"] - rating_adjustment
)

ucla_admit_df.head()

# Inferential Statistics

In [None]:
from scipy import stats
# Evaluate the bare function object (no call) so the notebook echoes it;
# the test itself is run in a later cell.
stats.ttest_ind

In [None]:
# Inferential statistics example: a t-test can tell whether the gap in mean
# GRE score between admitted and non-admitted applicants is real or could be
# due to random chance.

# Split the GRE scores by admission outcome. Despite the "mean" in their
# names, both variables hold the per-applicant scores, not averages.
gre_scores = ucla_admit_df["GRE Score"]
admitted_flag = ucla_admit_df["Admitted"]
adm_mean_GRE = gre_scores[admitted_flag == 1]
not_mean_GRE = gre_scores[admitted_flag == 0]

In [None]:
# perform the t-test for differences between means
# NOTE(review): called with SciPy defaults, which per the SciPy docs assume
# equal population variances — confirm that is appropriate for this data.
stats.ttest_ind(adm_mean_GRE, not_mean_GRE)

In [None]:
# Show the mean GRE score of the admitted and non-admitted groups side by side.
adm_mean_GRE.mean(), not_mean_GRE.mean()

# Machine Learning

In [None]:
# Inspect the table, including the engineered columns, before modeling.
ucla_admit_df.head()

In [None]:
# Move the target variable "Admitted" to be the last column.
# Columns are selected by name rather than by hard-coded position, so this
# cell is safe to re-run (a positional swap would flip the last two columns
# back on a second run) and keeps working if feature columns are added or
# dropped in earlier cells. Every non-target column is kept as a feature.
target_col = "Admitted"
feature_cols = [col for col in ucla_admit_df.columns if col != target_col]
ucla_admit_df = ucla_admit_df[feature_cols + [target_col]]
ucla_admit_df.head()

## Separate the data into X (features) and y (target)

In [None]:
# Feature matrix: every column except the last one (the target), as a plain
# NumPy array rather than a DataFrame.
# Capital X conventionally denotes the features the algorithm learns from.
X = ucla_admit_df.iloc[:, :-1].to_numpy()

In [None]:
# Peek at the first five feature rows.
X[:5]

In [None]:
# Target vector: the last column as a NumPy array.
# Lowercase y conventionally denotes the classifications to predict; they must
# already be numeric (string labels can be encoded with .map()).
y = ucla_admit_df.iloc[:, -1].to_numpy()

In [None]:
# Display the target array.
y

## Split the data into train and test sets

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [None]:
# Check the row/column counts of the training and test feature arrays.
X_train.shape, X_test.shape

In [None]:
# Standardize the features with scikit-learn. The scaler is fitted on the
# training set only, and that same fitted scaler is then applied to both the
# training and the test set.
from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

In [None]:
# First five training rows before standardization.
X_train[:5]

In [None]:
# The same five training rows after standardization.
X_train_std[:5]

## Create and evaluate some models

# CLASSIFICATION ALGORITHMS

### Logistic Regression

In [None]:
# Logistic regression: a binary classifier, trained on the standardized
# training set and then used to predict labels for the test set.
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression().fit(X_train_std, y_train)
log_reg.predict(X_test_std)

In [None]:
# Score the fitted logistic regression on the held-out test set.
log_reg.score(X_test_std, y_test)

In [None]:
# Per-class probability estimates for each test row.
log_reg.predict_proba(X_test_std)

### Support Vector Machine

In [None]:
# Support vector classifier with default settings: fit on the standardized
# training set, then score on the test set.
from sklearn.svm import SVC

svm = SVC().fit(X_train_std, y_train)
svm.score(X_test_std, y_test)

### Decision Tree

In [None]:
# Decision tree classifier: fit on the standardized training set, then score
# on the test set.
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so its score is reproducible from run to run, matching the
# fixed random_state already used for the train/test split.
tree = DecisionTreeClassifier(random_state=1)
tree.fit(X_train_std, y_train)
tree.score(X_test_std, y_test)

### K-nearest Neighbors

In [None]:
# K-nearest neighbors classifier using the 5 closest training points: fit on
# the standardized training set, then score on the test set.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_std, y_train)
knn.score(X_test_std, y_test)