# Scikit-learn Introduction

## Get the data

In [1]:
import pandas as pd

# prevent warnings from displaying
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Get the Professor Ratings dataset
prof_df = pd.read_csv("professor_ratings.csv")

In [3]:
prof_df.head()

Unnamed: 0,professor_id,age,gender,minority,native_english,lower_division,attractive,eval,evaluation
0,1,36,female,yes,yes,no,yes,4.3,yes
1,2,59,male,no,yes,no,no,4.5,yes
2,3,51,male,no,yes,no,no,3.7,yes
3,4,40,female,no,yes,no,no,4.3,yes
4,5,31,female,no,yes,no,yes,4.4,yes


### Rename features

In [4]:
prof_df = prof_df.rename(index=str, columns= {"evaluation":"high_eval"})

In [5]:
prof_df.head()

Unnamed: 0,professor_id,age,gender,minority,native_english,lower_division,attractive,eval,high_eval
0,1,36,female,yes,yes,no,yes,4.3,yes
1,2,59,male,no,yes,no,no,4.5,yes
2,3,51,male,no,yes,no,no,3.7,yes
3,4,40,female,no,yes,no,no,4.3,yes
4,5,31,female,no,yes,no,yes,4.4,yes


### Drop unwanted features

In [6]:

# Drop unwanted features
prof_df = prof_df.drop("professor_id", axis=1)

In [7]:
prof_df.head()

Unnamed: 0,age,gender,minority,native_english,lower_division,attractive,eval,high_eval
0,36,female,yes,yes,no,yes,4.3,yes
1,59,male,no,yes,no,no,4.5,yes
2,51,male,no,yes,no,no,3.7,yes
3,40,female,no,yes,no,no,4.3,yes
4,31,female,no,yes,no,yes,4.4,yes


### Feature Transformation

In [8]:
# Most algorithms require strings to be converted to numbers (you can use .map() to accomplish this).
# Transform the appropriate features from strings to a binary feature
prof_df["gender"] = prof_df["gender"].map({"male":0, "female": 1})
prof_df["minority"] = prof_df["minority"].map({"no":0, "yes": 1})
prof_df["native_english"] = prof_df["native_english"].map({"no":0, "yes": 1})
prof_df["lower_division"] = prof_df["lower_division"].map({"no":0, "yes": 1})
prof_df["attractive"] = prof_df["attractive"].map({"no":0, "yes": 1})
prof_df["high_eval"] = prof_df["high_eval"].map({"no":0, "yes": 1})

In [9]:
prof_df.head()

Unnamed: 0,age,gender,minority,native_english,lower_division,attractive,eval,high_eval
0,36,1,1,1,0,1,4.3,1
1,59,0,0,1,0,0,4.5,1
2,51,0,0,1,0,0,3.7,1
3,40,1,0,1,0,0,4.3,1
4,31,1,0,1,0,1,4.4,1


### Feature Engineering

In [10]:
upper_attrprof = prof_df.loc[(prof_df["lower_division"] == 0) & (prof_df["attractive"] == 1), "high_eval"].mean()
upper_unattrprof = prof_df.loc[(prof_df["lower_division"] == 0) & (prof_df["attractive"] == 0), "high_eval"].mean()
upper_attrprof, upper_unattrprof

(0.8283582089552238, 0.8313953488372093)

In [11]:
lower_attrprof = prof_df.loc[(prof_df["lower_division"] == 1) & (prof_df["attractive"] == 1), "high_eval"].mean()
lower_unattrprof = prof_df.loc[(prof_df["lower_division"] == 1) & (prof_df["attractive"] == 0), "high_eval"].mean()
lower_attrprof, lower_unattrprof

(0.9056603773584906, 0.8076923076923077)

In [12]:
# create a new feature "beauty bias" to account for the undergraduate students' course evaluations
    # appearing to be influenced by the professor's attractiveness
prof_df["beauty_bias"] = prof_df["lower_division"] * prof_df["attractive"]

In [15]:
prof_df.head()

Unnamed: 0,age,gender,minority,native_english,lower_division,attractive,eval,high_eval,beauty_bias
0,36,1,1,1,0,1,4.3,1,0
1,59,0,0,1,0,0,4.5,1,0
2,51,0,0,1,0,0,3.7,1,0
3,40,1,0,1,0,0,4.3,1,0
4,31,1,0,1,0,1,4.4,1,0


In [16]:
# reorder the features so that the target feature is last
prof_df = prof_df.iloc[:, [0,1,2,3,4,5,6,8,7]]

In [17]:
prof_df.head()

Unnamed: 0,age,gender,minority,native_english,lower_division,attractive,eval,beauty_bias,high_eval
0,36,1,1,1,0,1,4.3,0,1
1,59,0,0,1,0,0,4.5,0,1
2,51,0,0,1,0,0,3.7,0,1
3,40,1,0,1,0,0,4.3,0,1
4,31,1,0,1,0,1,4.4,0,1


# Inferential Statistics

In [23]:
from scipy import stats
stats.ttest_ind

<function scipy.stats.stats.ttest_ind(a, b, axis=0, equal_var=True, nan_policy='propagate')>

In [18]:
# You can apply inferential statistics
# For example, you can use a t-test to see if the difference in mean eval scores for attractive proffesors 
#              between the upper- and lower-division students is real or by random chance.

# get the eval scores for attractive professors for each group of students

upper_attr_evals = prof_df.loc[(prof_df["lower_division"] == 0) & (prof_df["attractive"] == 1), "eval"]
lower_attr_evals = prof_df.loc[(prof_df["lower_division"] == 1) & (prof_df["attractive"] == 1), "eval"]

In [21]:
upper_attr_evals.mean(), lower_attr_evals.mean()

(3.950746268656715, 4.288679245283019)

In [24]:
# perform the t-test for differences between means
stats.ttest_ind(upper_attr_evals, lower_attr_evals)

Ttest_indResult(statistic=-3.913537577950357, pvalue=0.00012770919203930345)

# Machine Learning

In [25]:
prof_df.head()

Unnamed: 0,age,gender,minority,native_english,lower_division,attractive,eval,beauty_bias,high_eval
0,36,1,1,1,0,1,4.3,0,1
1,59,0,0,1,0,0,4.5,0,1
2,51,0,0,1,0,0,3.7,0,1
3,40,1,0,1,0,0,4.3,0,1
4,31,1,0,1,0,1,4.4,0,1


## Separate the data into X (features) and y (target)

In [26]:
# capital X usually indicates all of the features that the algorithm will be given to learn from
    # return only the values, not the DataFrame
    
X = prof_df.iloc[:,:-1].values

In [27]:
X[:5]

array([[36. ,  1. ,  1. ,  1. ,  0. ,  1. ,  4.3,  0. ],
       [59. ,  0. ,  0. ,  1. ,  0. ,  0. ,  4.5,  0. ],
       [51. ,  0. ,  0. ,  1. ,  0. ,  0. ,  3.7,  0. ],
       [40. ,  1. ,  0. ,  1. ,  0. ,  0. ,  4.3,  0. ],
       [31. ,  1. ,  0. ,  1. ,  0. ,  1. ,  4.4,  0. ]])

In [28]:
# lowercase y usually indicates the target variabke, the classifications that you would like to predict.
    # return only the values
    
y = prof_df.iloc[:,-1].values

In [29]:
y[:5]

array([1, 1, 1, 1, 1], dtype=int64)

## Split the data into train and test sets

In [30]:
# split dataset into test/train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.3)

In [31]:
X_train.shape, X_test.shape

((324, 8), (139, 8))

## Create and evaluate some models

# CLASSIFICATION ALGORITHMS

### Logistic Regression

In [32]:
# binary classifier
from sklearn.linear_model import LogisticRegression


log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
log_reg.predict(X_test)

array([1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [33]:
log_reg.score(X_test, y_test)

0.8920863309352518

In [34]:
log_reg.predict_proba(X_test)[:5]

array([[0.00218987, 0.99781013],
       [0.02292521, 0.97707479],
       [0.52490119, 0.47509881],
       [0.03281055, 0.96718945],
       [0.03192454, 0.96807546]])

### Support Vector Machine

In [35]:
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

0.8705035971223022

### Decision Tree

In [36]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
tree.score(X_test, y_test)

1.0

### K-nearest Neighbors

In [37]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.8345323741007195