In [3]:
import csv
import numpy as np
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVC

In [9]:
def findCommunities(filename):
    communities = []
    with open(filename, 'r') as csvfile:
        filereader = csv.reader(csvfile)
        j = 0
        for i in filereader:
            if j > 0:
                if i[1] not in communities:
                    communities.append(i[1])
            j+=1
    return communities

In [85]:
def extractData(filename,communities,addstate = False,addcommunity = False):
    features = []
    x=[]
    y=[]
    with open(filename, 'r') as csvfile:
        filereader = csv.reader(csvfile)
        j = 0
        for i in filereader:
            if j > 0:
                data = i[2:len(i)-1]
                if addstate:
                    state = [0.0]*56
                    state[int(i[0])-1] = 1.0
                    data = data + state
                    features = features + ["state"+str(i) for i in range(56)]
                if addcommunity:
                    tmp = [0.0]*len(communities)
                    tmp[communities.index(i[1])] = 1.0
                    data = data + tmp
                    features = features + communities
                vect = np.array(data)
                #vect = np.array(i[3:len(i)-1])
                x.append(vect.astype(np.float))
                y.append(float(i[-1]))
            else:
                features = [i[0]]+i[2:len(i)-1]
            j+=1
        y = np.array(y)
        x = np.array(x)
        
        return x,y,features

In [86]:
def meanSquaredError(true,predict):
    print("Mean squared error: %.4f"
      % np.mean((predict-true) ** 2))

def ridgeRegression(alpha = 1.0):
    return Ridge(alpha=alpha, fit_intercept=True, normalize=False, copy_X=True,
                 max_iter=None, tol=0.001, solver='auto', random_state=None)

In [87]:
def do_cross_validation(X, y,clf, n_folds=5):
    cv = KFold(len(y), n_folds)
    accuracies = []
    for train_ind, test_ind in cv: 
        clf.fit(X[train_ind], y[train_ind])
        predictions = clf.predict(X[test_ind])
        meanSquaredError(y[test_ind],predictions)

In [88]:
def testAlpha(alpha_list):
    for i in alpha_list:
        ridgeReg= ridgeRegression(alpha = i)
        ridgeReg.fit(X[:1000],Y[:1000])
        predictedY=ridgeReg.predict(X[1000:])
        meanSquaredError(predictedY,Y[1000:])

In [96]:
filename = 'C:\\Users\\ar1\\Documents\ML\\Project\\Crime Prediction Data(1)\\Crime Prediction Data\\communities-crime-clean.csv'
distintCommunities = findCommunities(filename)
X,Y,Features = extractData(filename,distintCommunities,addstate=True,addcommunity = True)

In [97]:
ridgeReg= ridgeRegression()
ridgeReg.fit(X[:1000],Y[:1000])
predictedY=ridgeReg.predict(X[1000:])

In [98]:
meanSquaredError(predictedY,Y[1000:])
do_cross_validation(X[:1000],Y[:1000],ridgeReg,10)

Mean squared error: 0.0172
Mean squared error: 0.0386
Mean squared error: 0.0244
Mean squared error: 0.0170
Mean squared error: 0.0191
Mean squared error: 0.0132
Mean squared error: 0.0312
Mean squared error: 0.0266
Mean squared error: 0.0259
Mean squared error: 0.0274
Mean squared error: 0.0122


Explaination : MSE under 10 fold CV
3-c-i) The MSE under 10 fold CV can be found above. The MSE under 10 fold CV, when plotted on a graph, turns out to have its highs and lows. 

In [101]:
alphaList = [10,1,0.1,.01,.001]
testAlpha(alphaList)

Mean squared error: 0.0166
Mean squared error: 0.0172
Mean squared error: 0.0194
Mean squared error: 0.0251
Mean squared error: 0.0291


Explaination : Best Alpha
2-b-iii) The best alpha is 10 with a score of 0.0166. This is the best score compared with the other alpha scores.

In [76]:
ridgeReg= ridgeRegression()
ridgeReg.fit(X,Y)
predictedY=ridgeReg.predict(X)
meanSquaredError(predictedY,Y)

Mean squared error: 0.0047


Explaination : MSE on the training set
3-c-ii) The MSE on the training set is 0.0047. The MSE here is really good and it means that I was able to find out a very good fit ( because, lower the MSE, the better the fit is ). The MSE on the training set for ridgeRegression is better than the MSE on the training set for Linear Regression. 

In [100]:

print ("Important  \n\n")

feature_indx = np.argsort(ridgeReg.coef_)[::-1]
for i in feature_indx[:10]:
    print (Features[i], ridgeReg.coef_[i])


print ("\n\n---------------------\n\n")

feature_indx = np.argsort(ridgeReg.coef_)
for i in feature_indx[:10]:
    print (Features[i], ridgeReg.coef_[i])

Irvinecity 0.366471328241
Greenvillecity 0.304151822622
WestCovinacity 0.272955031258
Northportcity 0.249991072666
Jonesborocity 0.241202621618
Plantationcity 0.231473526834
Coltoncity 0.222102608709
Prichardcity 0.220073174043
Atholtown 0.212138537445
Bentonvillecity 0.206723825867


---------------------


Sylacaugacity -0.206712082519
Eustiscity -0.193520397373
Selmacity -0.183195306494
Alpharettacity -0.173953640364
state26 -0.171897677954
Natchitochescity -0.157725946388
Lawrencevillecity -0.152652367071
Conwaycity -0.148420041155
CollegeParkcity -0.148220007261
Atlantacity -0.140725838344
