In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn import model_selection, linear_model, tree, ensemble, preprocessing, metrics
# Let wide frames display up to 1000 columns before pandas truncates them.
pd.options.display.max_columns = 1000

#### Professor Ratings

In [2]:
# Read the train / test splits from CSV.
train = pd.read_csv("rmp_train.csv")
test = pd.read_csv("rmp_test.csv")

# Columns that the later cells treat as text / categorical labels.
_TEXT_COLUMNS = ("textbookuse", "comments", "interest")


def _stringify_columns(frame, columns):
    """Rewrite each of `columns` in `frame` so that every value is a str.

    Values that are already str are kept as-is; anything else is passed
    through str().
    NOTE(review): presumably the non-str values are missing entries (NaN),
    which would become the literal text "nan" -- confirm this is intended.
    """
    for name in columns:
        frame[name] = [value if type(value) == str else str(value) for value in frame[name]]


_stringify_columns(train, _TEXT_COLUMNS)
_stringify_columns(test, _TEXT_COLUMNS)

In [119]:
# Feature engineering: pick the tabular predictors and replace the three
# categorical columns ("dept", "textbookuse", "interest") with 0/1 indicator
# columns. Encoders are fitted on the training split only and then applied
# to the test split.

feats = ["dept", "helpcount", "profgender", "nothelpcount", "profhotness", "easiness", "textbookuse", "interest"]
X_train = train[feats]
y_train = train["quality"]

X_test = test[feats]
y_test = test["quality"]


def _one_hot(train_col, test_col, prefix, drop_last=False):
    """Binarize one categorical column for both splits.

    Parameters
    ----------
    train_col, test_col : pandas.Series
        The same column taken from the train and the test frame.
    prefix : str
        Indicator columns are named "<prefix>_0", "<prefix>_1", ...
    drop_last : bool
        When True the last indicator column is removed from both splits.

    Returns
    -------
    (binarizer, train_frame, test_frame)
        The LabelBinarizer fitted on `train_col` and the two indicator frames.
    """
    binarizer = preprocessing.LabelBinarizer()
    encoded_train = binarizer.fit_transform(train_col)
    encoded_test = binarizer.transform(test_col)
    if drop_last:
        encoded_train = encoded_train[:, :-1]
        encoded_test = encoded_test[:, :-1]
    # Name the columns after the actual encoded width instead of the number
    # of unique labels: the two can differ (a two-class column is encoded as
    # a single indicator), which would make the DataFrame constructor fail.
    names = ["%s_%s" % (prefix, i) for i in range(encoded_train.shape[1])]
    # Reuse the source index so the axis=1 concat below lines rows up even
    # when train/test do not carry a default 0..n-1 index.
    return (
        binarizer,
        pd.DataFrame(encoded_train, columns=names, index=train_col.index),
        pd.DataFrame(encoded_test, columns=names, index=test_col.index),
    )


# dept: the last indicator column is dropped
lb, dept_bin_train, dept_bin_test = _one_hot(
    X_train["dept"], X_test["dept"], "dept", drop_last=True)

# textbookuse: all indicator columns kept
lb_text, text_bin_train, text_bin_test = _one_hot(
    X_train["textbookuse"], X_test["textbookuse"], "book")

# interest: all indicator columns kept
lb_interest, interest_bin_train, interest_bin_test = _one_hot(
    X_train["interest"], X_test["interest"], "interest")

# Backward-compatible alias: `lb_` previously ended up bound to the interest
# binarizer (it was reused for textbookuse and then overwritten).
lb_ = lb_interest

# Final design matrices: the remaining raw columns followed by the indicator
# blocks. "nothelpcount" is selected in `feats` above but removed here, so it
# is not part of the model inputs.
_replaced_or_unused = ["dept", "textbookuse", "interest", "nothelpcount"]
X_train = pd.concat(
    [X_train.drop(columns=_replaced_or_unused), dept_bin_train, text_bin_train, interest_bin_train],
    axis=1,
)
X_test = pd.concat(
    [X_test.drop(columns=_replaced_or_unused), dept_bin_test, text_bin_test, interest_bin_test],
    axis=1,
)

In [111]:
# Sanity check: feature-matrix and target shapes for each split, one split per line.
print(*[part.shape for part in (X_train, y_train)])
print(*[part.shape for part in (X_test, y_test)])

(92024, 216) (92024,)
(25787, 216) (25787,)


In [121]:
# Baseline: ordinary least squares fitted on the tabular feature matrix.
linreg = linear_model.LinearRegression().fit(X_train, y_train)

# Predictions for both splits (generator unpacked left to right: train, test).
preds_train, preds_test = (linreg.predict(split) for split in (X_train, X_test))

# Mean squared error: train on the first line, test on the second.
print(
    metrics.mean_squared_error(y_train, preds_train),
    metrics.mean_squared_error(y_test, preds_test),
    sep="\n",
)

5.13253653095
4.32282770039


In [123]:
# Random forest on the tabular features.
# NOTE: despite its name, `dectree` holds a RandomForestRegressor; the name is
# kept so that any other cell referring to it keeps working.
# random_state is fixed so that re-running the notebook reproduces the same
# forest and therefore the same reported errors (the estimator is randomized
# when left unseeded).
dectree = ensemble.RandomForestRegressor(random_state=0)
dectree.fit(X_train, y_train)

# These assignments replace whatever preds_train / preds_test held before.
preds_train = dectree.predict(X_train)
preds_test = dectree.predict(X_test)

# Mean squared error: train first, then test.
print(metrics.mean_squared_error(y_train, preds_train))
print(metrics.mean_squared_error(y_test, preds_test))

3.83688519578
4.8462469201


In [124]:
from sklearn.feature_extraction.text import CountVectorizer

In [125]:
# Bag-of-n-grams counts for the free-text comments: 1- to 3-grams, English
# stop words removed, min_df=80 as the document-frequency cutoff.
# The vocabulary is learned from the training comments only and then reused
# to encode the test comments.
countvec = CountVectorizer(
    ngram_range=(1, 3),
    min_df=80,
    stop_words="english",
)
cv_mat_train, cv_mat_test = (
    countvec.fit_transform(train["comments"]),
    countvec.transform(test["comments"]),
)

In [128]:
# Put the tabular features (converted to a sparse CSR matrix) side by side
# with the comment n-gram counts, separately for each split.
X_train_sparse, X_test_sparse = (
    sp.sparse.hstack((sp.sparse.csr_matrix(np.array(tabular)), ngram_counts))
    for tabular, ngram_counts in ((X_train, cv_mat_train), (X_test, cv_mat_test))
)

In [129]:
# Shapes of the stacked matrices: train on the first line, test on the second.
print(X_train_sparse.shape, X_test_sparse.shape, sep="\n")

(92024, 3602)
(25787, 3602)


In [130]:
# Ordinary least squares again, this time on the stacked sparse matrix
# (tabular features plus comment n-gram counts). Rebinds `linreg`,
# `preds_train` and `preds_test`.
linreg = linear_model.LinearRegression().fit(X_train_sparse, y_train)

preds_train, preds_test = (
    linreg.predict(stacked) for stacked in (X_train_sparse, X_test_sparse)
)

# Mean squared error: train on the first line, test on the second.
print(
    metrics.mean_squared_error(y_train, preds_train),
    metrics.mean_squared_error(y_test, preds_test),
    sep="\n",
)

2.88871964114
2.62784838935
