# In [3]:  (Jupyter notebook cell prompt left over from the export)
import numpy
import pandas
import sklearn
import matplotlib.pyplot as pypl
from sklearn import linear_model
from sklearn.cross_validation import train_test_split
from sklearn.metrics import r2_score
from sklearn import metrics
from sklearn.cross_validation import KFold
from sklearn import cross_validation
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import KFold
from prettytable import PrettyTable
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.cross_validation import KFold
import matplotlib.pyplot as plt
import string

# function that calculates adjusted R2 value as defined in 
# https://sourceforge.net/p/scikit-learn/mailman/scikit-learn-general/thread/516EC479.6080704@gmail.com/
def adj_r2_score(model,y,yhat):
	"""Return the adjusted R^2 of the predictions ``yhat`` against ``y``.

	Computes ``1 - (n - 1) / (n - p - 1) * (1 - R2)`` where ``n`` is
	``len(y)``, ``p`` is the number of predictors read from ``model.coef_``
	and ``R2`` comes from ``metrics.r2_score(y, yhat)``.

	Arguments:
		model -- fitted estimator exposing a ``coef_`` attribute
		y     -- observed target values
		yhat  -- predicted target values

	Raises ZeroDivisionError when ``n - p - 1`` is zero.
	"""
	n_samples = len(y)

	# coef_ may be 1-D (n_features,) or 2-D (n_targets, n_features); the
	# number of predictors is the size of the LAST axis. len(coef_) returns
	# the size of the first axis, which is 1 for a 2-D single-target coef_.
	coef_shape = numpy.shape(model.coef_)
	n_predictors = coef_shape[-1] if coef_shape else 1

	r2 = metrics.r2_score(y, yhat)
	return 1 - float(n_samples - 1) / (n_samples - n_predictors - 1) * (1 - r2)

# ===============================================================
# ================ HOLD.OUT (TEST/TRAIN) METHOD =================
# ===============================================================

# Hold-out evaluation: load the training data (year 2012 to 2014), drop rows
# with missing values, then score both models on 100 random splits that keep
# 40% of the rows for testing.
train = pandas.read_csv("training.csv", encoding='utf-8')
train_df = pandas.DataFrame(train).dropna()

# predictor columns and target column
feature_columns = ["teaching", "international", "research", "citations", "income"]
X = train_df[feature_columns]
y = train_df[["total_score"]]

# one entry per split; *_mul = multiple regression, *_svr = support vector regression
r2_mul, adj_r2_mul, mse_mul = [], [], []
r2_svr, adj_r2_svr, mse_svr = [], [], []

for _ in range(100):
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

	# multiple linear regression model
	lreg = linear_model.LinearRegression()
	lreg.fit(X_train, y_train)
	y_pred = lreg.predict(X_test)
	r2_mul.append(r2_score(y_test, y_pred))
	adj_r2_mul.append(adj_r2_score(lreg, y_test, y_pred))
	mse_mul.append(mean_squared_error(y_test, y_pred))

	# support vector regression (Linear kernel); the target frame is
	# flattened to 1-D before fitting
	svr = SVR(kernel='linear')
	svr.fit(X_train, numpy.ravel(y_train))
	y_pred = svr.predict(X_test)
	r2_svr.append(r2_score(y_test, y_pred))
	adj_r2_svr.append(adj_r2_score(svr, y_test, y_pred))
	mse_svr.append(mean_squared_error(y_test, y_pred))

# summary table: one row per model holding the mean of each score over all splits
t = PrettyTable(['model','r2','adj.r2','mse'])
summary_rows = (
	('Multiple Regression', r2_mul, adj_r2_mul, mse_mul),
	('Support Vector Regression', r2_svr, adj_r2_svr, mse_svr),
)
for model_name, r2_runs, adj_r2_runs, mse_runs in summary_rows:
	t.add_row([model_name, numpy.average(r2_runs), numpy.average(adj_r2_runs), numpy.average(mse_runs)])

rule = "==============================="
print (rule)
print (" Hold.out (test-train) method ")
print (rule)
print (t)

# ===============================================================
# ==================== CROSS VALIDATION =========================
# ===============================================================

# 10-fold cross-validation of both models on the training data.
# Rows with missing values are dropped before folding.
data = pandas.read_csv("training.csv",encoding='utf-8')
df = pandas.DataFrame(data)
df.dropna(inplace = True)

# one table per model: a row per fold plus a final 'average' row
t1 = PrettyTable(['Multiple Regression','Fold','r2','adj.r2','mse'])
t2 = PrettyTable(['Support Vector Regression','Fold','r2','adj.r2','mse'])

X = df[["teaching","international","research","citations","income"]]
y = df[["total_score"]]

lreg = linear_model.LinearRegression()	# multiple linear regression model (refit on every fold)

# legacy sklearn.cross_validation API: built from the sample count and
# iterated directly, yielding (train_index, test_index) pairs
kf = KFold(len(X), n_folds = 10)

# per-fold scores; *_mul = multiple regression, *_svr = support vector regression
r2_mul = []
adj_r2_mul = []
mse_mul = []

r2_svr = []
adj_r2_svr = []
mse_svr = []

for count, (train_index, test_index) in enumerate(kf, 1):
	# Select exactly the rows named by the index arrays. Slicing from the
	# first to the last index instead always drops the final row and pulls
	# held-out rows into the training data whenever the training indices
	# surround the test fold.
	X_train, X_test = X.iloc[train_index], X.iloc[test_index]
	y_train, y_test = y.iloc[train_index], y.iloc[test_index]

	# --- multiple linear regression ---
	lreg.fit(X_train, y_train)
	y_pred = lreg.predict(X_test)
	r2 = r2_score(y_test, y_pred)
	adj_r2 = adj_r2_score(lreg, y_test, y_pred)
	mse = mean_squared_error(y_test, y_pred)
	r2_mul.append(r2)
	adj_r2_mul.append(adj_r2)
	mse_mul.append(mse)
	t1.add_row([' ', count, r2, adj_r2, mse])

	# --- support vector regression (Linear kernel) ---
	svr = SVR(kernel = 'linear')
	svr.fit(X_train, numpy.ravel(y_train))
	y_pred = svr.predict(X_test)
	r2 = r2_score(y_test, y_pred)
	# compute the SVR's own adjusted R2; previously the regression value was
	# reused here and also appended a second time to adj_r2_mul
	adj_r2 = adj_r2_score(svr, y_test, y_pred)
	mse = mean_squared_error(y_test, y_pred)
	r2_svr.append(r2)
	adj_r2_svr.append(adj_r2)
	mse_svr.append(mse)
	t2.add_row([' ', count, r2, adj_r2, mse])

t1.add_row(['average', "", numpy.average(r2_mul), numpy.average(adj_r2_mul), numpy.average(mse_mul)])
t2.add_row(['average', "", numpy.average(r2_svr), numpy.average(adj_r2_svr), numpy.average(mse_svr)])

print ("===============================")
print (" cross-validation method ")
print ("===============================")
print (t1)
print (t2)

# ===============================================================
# ======== Testing multiple regression model on test set ========
# ===============================================================

# Final evaluation: fit multiple regression on the full training data
# (year 2012 to 2014) and score it on the separate test data (year 2015 and
# 2016). Rows with missing values are dropped from both files.
train = pandas.read_csv("training.csv", encoding='utf-8')
train_df = pandas.DataFrame(train).dropna()

test = pandas.read_csv("testing.csv", encoding='utf-8')
test_df = pandas.DataFrame(test).dropna()

# same predictor and target columns for both frames
predictor_columns = ["teaching", "international", "research", "citations", "income"]
target_columns = ["total_score"]
X_train, y_train = train_df[predictor_columns], train_df[target_columns]
X_test, y_test = test_df[predictor_columns], test_df[target_columns]

# multiple linear regression model
lreg = linear_model.LinearRegression()
lreg.fit(X_train, y_train)
y_pred = lreg.predict(X_test)
# print y_pred	# remove the comment to print prediction results

r2 = r2_score(y_test, y_pred)
adj_r2 = adj_r2_score(lreg, y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# single-row results table
t = PrettyTable(['Multiple regression (on test data)','r2','adj.r2','mse'])
t.add_row(['', r2, adj_r2, mse])

rule = "==============================="
print (rule)
print (" Testing results ")
print (rule)
print (t)

# ===============================================================
# ======================== Graph plot ===========================
# ===============================================================

# Prepare the data for the plots: flatten the actual and predicted scores to
# 1-D and take a 50-row window for each year.
TOP_N = 50				# number of universities shown per plot
FIRST_ROW_2016 = 188	# assumes the 2016 rows start at position 188 of the cleaned test set -- TODO confirm against testing.csv

actual_scores = numpy.ravel(y_test)
actual_scores_2015 = actual_scores[0:TOP_N]	# get actual scores of 50 universities for test set for year 2015
actual_scores_2016 = actual_scores[FIRST_ROW_2016:(FIRST_ROW_2016+TOP_N)]	# get actual scores for test set for year 2016

test_scores = numpy.ravel(y_pred)
test_scores_2015 = test_scores[0:TOP_N]	# get predicted scores of 50 universities for year 2015
test_scores_2016 = test_scores[FIRST_ROW_2016:(FIRST_ROW_2016+TOP_N)]	# get predicted scores for year 2016

# Tick labels: university names cut to 20 characters. They are built as a
# new array instead of being written back element by element into the
# raveled array, which may be a view onto test_df's own data (or read-only,
# depending on the pandas version).
univ_names = numpy.array(
	[name[0:20] for name in numpy.ravel(test_df[["university_name"]])],
	dtype=object,
)
univ_names_2015 = univ_names[0:TOP_N]
univ_names_2016 = univ_names[FIRST_ROW_2016:(FIRST_ROW_2016+TOP_N)]

# plotting 2015 university actual scores (published) vs. predicted scores:
# actual scores as scatter points, predicted scores as a dashed line
x_2015 = numpy.array(range(len(univ_names_2015)))
y_actual_2015 = actual_scores_2015
y_test_2015 = test_scores_2015
my_xticks_2015 = univ_names_2015

plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.2)
plt.xticks(x_2015, my_xticks_2015, rotation = 'vertical')	# one vertical name label per university
plt.tick_params(axis='x', which='major', labelsize=8)
plt.scatter(x_2015, y_actual_2015, marker = 'o', c = "skyblue", alpha = 0.3, s = 30, label = '2015 actual scores')
plt.plot(x_2015, y_test_2015, linestyle = '--', c = "red", alpha = 1, lw = 1.5, label = '2015 predicted scores')
plt.legend(loc = 'upper right')
plt.xlabel('university')
plt.ylabel('overall_score')
plt.title('2015 World University Rankings (Top 50)')
# tight_layout has to be called; the bare attribute reference did nothing.
# It recomputes the margins so the rotated tick labels fit in the figure.
plt.tight_layout()
plt.show()

# plotting 2016 university actual scores (published) vs. predicted scores:
# actual scores as scatter points, predicted scores as a dashed line
x_2016 = numpy.array(range(len(univ_names_2016)))
y_actual_2016 = actual_scores_2016
y_test_2016 = test_scores_2016
my_xticks_2016 = univ_names_2016

plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.2)
plt.xticks(x_2016, my_xticks_2016, rotation = 'vertical')	# one vertical name label per university (was set twice)
plt.tick_params(axis='x', which='major', labelsize=8)
plt.scatter(x_2016, y_actual_2016, marker = '^', c = "lightgrey", alpha = 0.3, s = 30, label = '2016 actual scores')
plt.plot(x_2016, y_test_2016, linestyle = '--', c = "deeppink", alpha = 1, lw = 1.5, label = '2016 predicted scores')
plt.legend(loc = 'upper right')
plt.xlabel('university')
plt.ylabel('overall_score')
plt.title('2016 World University Rankings (Top 50)')
# same layout step as the other plots in this file, so the rotated tick
# labels fit in the figure
plt.tight_layout()
plt.show()

# plotting top 50 universities of 2015 based on calculated (predicted) scores
x_2015 = numpy.array(range(len(univ_names_2015)))
y_actual_2015 = actual_scores_2015
y_test_2015 = test_scores_2015
my_xticks_2015 = univ_names_2015

plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.2)
plt.xticks(x_2015, my_xticks_2015, rotation = 'vertical')	# one vertical name label per university
plt.tick_params(axis='x', which='major', labelsize=8)
# The title and the comment above announce predicted scores, so the bars use
# the model's predictions (previously the actual scores were drawn). The
# label gives legend() an entry to show.
plt.bar(x_2015, y_test_2015, alpha = 0.3, label = '2015 predicted scores')
plt.legend(loc = 'upper right')
plt.xlabel('university')
plt.ylabel('overall_score')
plt.title('2015 Top 50 Universities (based on predicted overall score)')
# tight_layout has to be called; the bare attribute reference did nothing.
plt.tight_layout()
plt.show()

# plotting top 50 universities of 2016 based on calculated (predicted) scores
x_2016 = numpy.array(range(len(univ_names_2016)))
y_actual_2016 = actual_scores_2016
y_test_2016 = test_scores_2016
my_xticks_2016 = univ_names_2016

plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.2)
plt.xticks(x_2016, my_xticks_2016, rotation = 'vertical')	# one vertical name label per university
plt.tick_params(axis='x', which='major', labelsize=8)
# The title and the comment above announce predicted scores, so the bars use
# the model's predictions (previously the actual scores were drawn). The
# label gives legend() an entry to show.
plt.bar(x_2016, y_test_2016, alpha = 0.3, label = '2016 predicted scores')
plt.legend(loc = 'upper right')
plt.xlabel('university')
plt.ylabel('overall_score')
plt.title('2016 Top 50 Universities (based on predicted overall score)')
# tight_layout has to be called; the bare attribute reference did nothing.
plt.tight_layout()
plt.show()

# Disabled plotting code, kept as a bare string literal that is never assigned
# or executed. It plots all test-set rows per year (rows 0-187 as 2015, the
# rest as 2016) rather than a 50-row window.
# NOTE(review): the scatter calls pass alpha as the string '0.2'; a numeric
# value is probably required if this is ever re-enabled -- confirm.
"""
actual_scores = y_test
actual_scores = numpy.ravel(actual_scores)
actual_scores_2015 = actual_scores[0:188]	# get actual scores for test set for year 2015
actual_scores_2016 = actual_scores[188:len(actual_scores)]	# get actual scores for test set for year 2016
test_scores = y_pred
test_scores = numpy.ravel(test_scores)
test_scores_2015 = test_scores[0:188]	# get predicted scores for year 2015
test_scores_2016 = test_scores[188:len(test_scores)]	# get predicted scores for year 2016
univ_names = test_df[["university_name"]]
univ_names = numpy.ravel(univ_names)
univ_names_2015 = univ_names[0:188]
univ_names_2016 = univ_names[188:len(univ_names)]
x_2015 = numpy.array(range(len(univ_names_2015)))
y_actual_2015 = actual_scores_2015
y_test_2015 = test_scores_2015
plt.scatter(x_2015, y_actual_2015, marker = 'o', c = "skyblue", alpha = '0.2', s = 30, label = '2015 actual scores')
plt.plot(x_2015, y_test_2015, linestyle = '--', c = "red", alpha = 1, lw = 1.5, label = '2015 predicted scores')
plt.legend(loc = 'upper right')
plt.xlabel('university')
plt.ylabel('overall_score')
plt.title('2015 world university rankings')
plt.show()
x_2016 = numpy.array(range(len(univ_names_2016)))
y_actual_2016 = actual_scores_2016
y_test_2016 = test_scores_2016
plt.scatter(x_2016, y_actual_2016, marker = '^', c = "lightgrey", alpha = '0.2', s = 30, label = '2016 actual scores')
plt.plot(x_2016, y_test_2016, linestyle = '--', c = "deeppink", alpha = 1, lw = 1.5, label = '2016 predicted scores')
plt.legend(loc = 'upper right')
plt.xlabel('university')
plt.ylabel('overall_score')
plt.title('2016 world university rankings')
plt.show()
"""

# Second disabled block, also an unassigned string literal that is never
# executed.
# NOTE(review): it references y_2015, which is not defined anywhere in the
# visible file, and titles a 2016 plot '2015 world university rankings'.
"""
x_2016 = numpy.array(range(len(univ_names_2016)))
y_actual_2016 = actual_scores_2016
y_test_2016 = test_scores_2016
plt.scatter(x_2016, y_actual_2016, marker = 'o', c = "skyblue", alpha = '0.2', s = 30, label = '2016 actual scores')
plt.plot(x_2016, y_test_2016, linestyle = '--', c = "red", alpha = 1, lw = 1.5, label = '2016 predicted scores')
plt.legend(loc = 'upper right')
plt.xlabel('university')
plt.ylabel('overall_score')
plt.title('2015 world university rankings')
x_2016 = numpy.array(range(len(univ_names_2016)))
y_actual_2016 = actual_scores_2016
y_test_2016 = test_scores_2016
plt.plot(x_2016, y_actual_2016, c = "green")
plt.plot(x_2016, y_test_2016, linestyle = "--", c = "red")
x_2016 = numpy.array(range(len(actual_scores_2016)))
y_2016 = actual_scores_2016
my_xticks_2016 = univ_names_2016
#plt.xticks(x_2015, my_xticks_2015, rotation = 'vertical')
plt.plot(x_2015, y_2015, c = "red")
plt.plot(x_2016, y_2016, c = "green")
"""



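# Output recorded when the notebook cell was run:
# ModuleNotFoundError: No module named 'prettytable'
# (the prettytable package must be installed for the PrettyTable import above to succeed)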