# Project Walkthrough

## Get the data

In [None]:
import pandas as pd

In [None]:
# Load the Professor Ratings CSV (path is relative to the working directory).
ratings_path = "professor_ratings.csv"
prof_df = pd.read_csv(ratings_path)

In [None]:
# Preview the first rows to check that the file loaded as expected.
prof_df.head()

### Rename features

In [None]:
# Rename the "evaluation" column to "high_eval". Only the column mapper is
# passed, so the row index keeps its existing labels.
prof_df = prof_df.rename(columns={"evaluation": "high_eval"})

In [None]:
# Preview the first rows to confirm the column was renamed.
prof_df.head()

### Drop unwanted features

In [None]:

# Remove the "professor_id" column from the DataFrame.
prof_df = prof_df.drop(columns="professor_id")

In [None]:
# Preview the first rows to confirm "professor_id" is gone.
prof_df.head()

### Feature Transformation

In [None]:
# Most algorithms require strings to be converted to numbers, so each
# two-valued string column is encoded as 0/1 with Series.map().
# NOTE(review): values absent from a mapping become NaN, so re-running this
# cell on already-encoded columns would blank them out.
prof_df["gender"] = prof_df["gender"].map({"male": 0, "female": 1})

# The remaining columns all share the same "no"/"yes" encoding.
yes_no_codes = {"no": 0, "yes": 1}
for binary_col in ("minority", "native_english", "lower_division", "attractive", "high_eval"):
    prof_df[binary_col] = prof_df[binary_col].map(yes_no_codes)

In [None]:
# Preview the first rows to confirm the string columns are now 0/1.
prof_df.head()

# Features

In [None]:
# Mean of "high_eval" among upper-division rows (lower_division == 0),
# split by whether the professor is flagged attractive.
upper_div_mask = prof_df["lower_division"] == 0
attractive_mask = prof_df["attractive"] == 1
unattractive_mask = prof_df["attractive"] == 0

upper_attrprof = prof_df.loc[upper_div_mask & attractive_mask, "high_eval"].mean()
upper_unattrprof = prof_df.loc[upper_div_mask & unattractive_mask, "high_eval"].mean()
upper_attrprof, upper_unattrprof

In [None]:
# Mean of "high_eval" among lower-division rows (lower_division == 1),
# split by whether the professor is flagged attractive.
lower_div_mask = prof_df["lower_division"] == 1
attractive_mask = prof_df["attractive"] == 1
unattractive_mask = prof_df["attractive"] == 0

lower_attrprof = prof_df.loc[lower_div_mask & attractive_mask, "high_eval"].mean()
lower_unattrprof = prof_df.loc[lower_div_mask & unattractive_mask, "high_eval"].mean()
lower_attrprof, lower_unattrprof

### Inferential Statistics

In [None]:
from scipy import stats
# Evaluating the bare name only displays the function object as the cell
# output; no test is run here.
stats.ttest_ind

In [None]:
# Inferential statistics: a t-test can indicate whether the difference in mean
# eval scores for attractive professors between upper- and lower-division
# students is real or due to random chance.

# Collect the eval scores of attractive professors for each group of students.
# NOTE(review): this reads an "eval" column, distinct from "high_eval" --
# confirm the dataset actually provides it.
attractive_mask = prof_df["attractive"] == 1
upper_attr_evals = prof_df.loc[(prof_df["lower_division"] == 0) & attractive_mask, "eval"]
lower_attr_evals = prof_df.loc[(prof_df["lower_division"] == 1) & attractive_mask, "eval"]

In [None]:
# Show the two group means side by side (upper-division first).
upper_attr_evals.mean(), lower_attr_evals.mean()

In [None]:
# Perform an independent two-sample t-test on the two groups of eval scores.
# NOTE(review): called with default options -- presumably the equal-variance
# form of the test; confirm that is intended.
stats.ttest_ind(upper_attr_evals, lower_attr_evals)

### Feature Engineering

In [None]:
# New feature "beauty_bias": the product of the "lower_division" and
# "attractive" columns, intended to capture lower-division course evaluations
# appearing to be influenced by the professor's attractiveness.
lower_div_flag = prof_df["lower_division"]
attractive_flag = prof_df["attractive"]
prof_df["beauty_bias"] = lower_div_flag * attractive_flag

In [None]:
# Preview the first rows to confirm the "beauty_bias" column was added.
prof_df.head()

In [None]:
# Reorder the columns so that the target feature is last.
# Selecting by name instead of fixed positions keeps this cell correct when it
# is re-run or when the column order of the input changes.
# NOTE(review): assumes "high_eval" is the target column -- confirm.
feature_cols = [col for col in prof_df.columns if col != "high_eval"]
prof_df = prof_df[feature_cols + ["high_eval"]]

In [None]:
# Preview the first rows to confirm the new column order.
prof_df.head()

# Machine Learning

In [None]:
# Look at the prepared DataFrame once more before splitting it into X and y.
prof_df.head()

## Separate the data into X (features) and y (target)

In [None]:
# By convention, capital X holds all of the features the algorithm learns from:
# every column except the last, returned as raw values rather than a DataFrame.
feature_frame = prof_df.iloc[:, :-1]
X = feature_frame.values

In [None]:
# Show the first five feature rows.
X[:5]

In [None]:
# By convention, lowercase y holds the target variable -- the classifications
# to predict: the last column, returned as raw values.
target_column = prof_df.iloc[:, -1]
y = target_column.values

In [None]:
# Show the first five target values.
y[:5]

## Split the data into train and test sets

In [None]:
# Split X and y into train and test sets, holding out 30% of the rows for
# testing; random_state is fixed so the split is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Show the dimensions of the training and test feature arrays.
X_train.shape, X_test.shape

## Create and evaluate some models

# CLASSIFICATION ALGORITHMS

### Logistic Regression

In [None]:
# Binary classifier: logistic regression.
from sklearn.linear_model import LogisticRegression
import warnings

log_reg = LogisticRegression(solver='lbfgs')

# Silence warnings only while fitting, rather than installing a session-wide
# "ignore" filter that would also hide warnings raised by every later cell.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    log_reg.fit(X_train, y_train)

# Predictions for the held-out rows (displayed as the cell output).
log_reg.predict(X_test)

In [None]:
# Score the fitted logistic regression on the held-out test split.
log_reg.score(X_test, y_test)

In [None]:
# Show the predicted class probabilities for the first five test rows.
log_reg.predict_proba(X_test)[:5]

### Support Vector Machine

In [None]:
# Support vector classifier: fit on the training split, then score on the
# held-out test split.
from sklearn.svm import SVC

svm = SVC(gamma='auto').fit(X_train, y_train)
svm.score(X_test, y_test)

### Decision Tree

In [None]:
# Decision tree classifier: fit on the training split, then score on the
# held-out test split.
from sklearn.tree import DecisionTreeClassifier

# Fix the seed (same value as the train/test split) so the fitted tree and its
# score are reproducible from run to run.
tree = DecisionTreeClassifier(random_state=1)
tree.fit(X_train, y_train)
tree.score(X_test, y_test)

### K-nearest Neighbors

In [None]:
# K-nearest neighbors classifier using 5 neighbors: fit on the training split,
# then score on the held-out test split.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
knn.score(X_test, y_test)