**Load Libraries**

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from __future__ import division
from sklearn import linear_model

**Load the data**

In [4]:
# Read the source statistics table. The path is relative, so the CSV has to sit
# in the notebook's working directory. Later cells read the AAR, STATKODE,
# Total, Percentage and COUNTRY columns from this frame.
stats_csv = 'KBH_Stats_noDanes_noBydel_1.csv'
AllData = pd.read_csv(stats_csv)

*Create the training data from the years up to and including 2011 and the test data from the years after 2011.
Also collect all country codes (STATKODE) so that a regression model can be fitted, and predictions made, for each country.*

In [5]:
# Empty 0x7 float accumulator; the prediction loop appends one 7-field row per
# (country, year) pair to it.
Data_Array = np.empty((0, 7))

# Chronological split on the AAR (year) column: rows up to and including 2011
# are used for fitting, rows after 2011 are held out for evaluation.
in_training_period = AllData['AAR'] <= 2011
in_test_period = AllData['AAR'] > 2011
TrainingData = AllData.loc[in_training_period]
TestData = AllData.loc[in_test_period]

# Country codes that have at least one training row; one model is fitted per code.
training_codes = TrainingData['STATKODE']
unqStatKode = training_codes.unique()

*Loop through each country, fit the regression models, and predict the test years.*

In [6]:
# Fit two linear regressions per country on the training years and evaluate
# them on the held-out years:
#   * regPerc: Total ~ AAR + Percentage
#   * reg:     Total ~ AAR
# One result row is produced per (country, test year), laid out as
#   [COUNTRY, AAR, Predictions_Perc, Predictions, Real, Error_Perc, Error]
# where each error is prediction minus the real value.
#
# NOTE(review): regPerc receives the test-year 'Percentage' as a feature. If
# that column is derived from the same year's Total, the target leaks into the
# prediction -- confirm how Percentage is computed before trusting Error_Perc.
regPerc = linear_model.LinearRegression()
reg = linear_model.LinearRegression()
predictYear = np.array([2012,2013,2014,2015])

# Rows are collected in a plain list and converted once at the end. Appending
# to the array inside the loop copied it on every iteration, turned every
# number into a string, and duplicated rows whenever the cell was re-run.
prediction_rows = []
for stat_code in unqStatKode:
    # Filter each split once per country instead of once per use.
    train_rows = TrainingData[TrainingData['STATKODE'] == stat_code]
    test_rows = TestData[TestData['STATKODE'] == stat_code]
    # Keep only the years we report on, in chronological order, so every
    # prediction is labelled with the year of the row it was computed from
    # (not with a year guessed from the row's position).
    test_rows = test_rows[test_rows['AAR'].isin(predictYear)].sort_values('AAR')
    if test_rows.empty:
        # Nothing to evaluate for this country; predict() cannot be called
        # on an empty feature matrix.
        continue

    regPerc.fit(train_rows[['AAR', 'Percentage']], train_rows[['Total']])
    PredPerc = regPerc.predict(test_rows[['AAR', 'Percentage']])
    reg.fit(train_rows[['AAR']], train_rows[['Total']])
    Pred = reg.predict(test_rows[['AAR']])

    Country = train_rows['COUNTRY'].unique()
    # .values works on old and new pandas; Series.as_matrix() was removed in 1.0.
    realData = test_rows['Total'].values
    test_years = test_rows['AAR'].values

    # The targets were passed as a one-column frame, so predictions come back
    # with shape (n_rows, 1); column 0 holds the predicted Total.
    for j in range(len(test_years)):
        prediction_rows.append([Country[0], test_years[j],
                                PredPerc[j][0], Pred[j][0],
                                realData[j],
                                PredPerc[j][0] - realData[j],
                                Pred[j][0] - realData[j]])

# dtype=object keeps the numbers numeric next to the country names; the
# explicit reshape preserves the (n, 7) layout even when no rows were produced.
Data_Array = np.array(prediction_rows, dtype=object).reshape(len(prediction_rows), 7)

*Put the predictions in a DataFrame and save them to a .csv file.*

In [7]:
# Label the accumulated prediction rows and export them.
Predictions = pd.DataFrame(
    Data_Array,
    columns = ['COUNTRY', 'AAR', 'Predictions_Perc','Predictions','Real','Error_Perc','Error'])

# Data_Array mixes country names with numbers, so the numeric fields arrive as
# strings/objects. Cast them (as the AvCountry cell does for AVERAGE) so the
# frame can be sorted, aggregated and plotted numerically.
numeric_columns = ['AAR', 'Predictions_Perc', 'Predictions', 'Real', 'Error_Perc', 'Error']
for column in numeric_columns:
    Predictions[column] = pd.to_numeric(Predictions[column])

Predictions.to_csv('KBH_Stats_Predictions.csv', sep=',', encoding='utf-8',index = False)

# A bare expression is only rendered when it is the last statement of a cell,
# so the preview goes at the end; head() keeps the output short.
Predictions.head()

*Create lists of the unique country names and years.*

In [8]:
# Distinct country names and distinct years found anywhere in the data set;
# both are used by the later top-25 and per-country average cells.
country_column = AllData.loc[:, 'COUNTRY']
year_column = AllData.loc[:, 'AAR']
CountryUnq = country_column.unique()
Years = year_column.unique()

*Find the 25 most populous nationalities in CPH in 2015, sum the rest into a single row labelled Other, and save the result to a .csv file.*

In [9]:
# Top 25 rows of 2015 by Total, plus one synthetic 'Other' row that aggregates
# every remaining 2015 row; only AAR, Total and COUNTRY are exported.
Data2015 = AllData[AllData['AAR'] == 2015]
TopData2015 = Data2015.nlargest(25,'Total')

# The remainder is "every 2015 row that was not selected above". Deriving it
# from the selected index (instead of nsmallest(len(CountryUnq)-25)) keeps the
# two sets disjoint and exhaustive even when Totals tie at the cut-off, when
# some countries have no 2015 row, or when there are fewer than 25 rows.
# This relies on AllData having a unique index, which is what read_csv
# produces by default. Sorting keeps the ascending order nsmallest returned.
LowData2015 = Data2015.drop(TopData2015.index).sort_values('Total')

# STATKODE 1 and Percentage 0 are placeholders for the aggregate row.
d = {'AAR':2015, 'STATKODE': 1, 'Total': LowData2015['Total'].sum(), 'Percentage': 0, 'COUNTRY': 'Other' }
df = pd.DataFrame(data=d,index=[1], columns = ['AAR','STATKODE','Total','Percentage','COUNTRY'])

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement and
# is also available on older versions.
TopData2015 = pd.concat([TopData2015, df])
TopData2015[['AAR','Total','COUNTRY']].to_csv('KBH_Top25.csv', sep=',', encoding='utf-8',index = False)

In [25]:
# For every year in the data, take the 25 rows with the largest Total and
# stack them into one frame (original row labels are kept).
# The per-year frames are concatenated in a single call: DataFrame.append was
# removed in pandas 2.0 and re-copied the growing frame on every iteration.
# Concatenating without an empty seed frame also keeps the source dtypes, so
# integer columns are no longer upcast to floats.
top_per_year = [AllData[AllData['AAR'] == year].nlargest(25, 'Total')
                for year in Years]
if top_per_year:
    Top25 = pd.concat(top_per_year)
else:
    # No years at all: fall back to an empty frame with the expected columns.
    Top25 = pd.DataFrame(columns = ['AAR','STATKODE','Total','Percentage','COUNTRY'])

In [26]:
# Parenthesised form prints the same text on Python 2 and is valid on Python 3,
# where the bare print statement is a SyntaxError.
print(Top25)

         AAR  STATKODE   Total  Percentage         COUNTRY
30    1992.0    5172.0  5272.0    0.139486         Tyrkiet
20    1992.0    5152.0  4433.0    0.117288     Jugoslavien
146   1992.0    5472.0  3161.0    0.083633        Pakistan
29    1992.0    5170.0  2954.0    0.078156  Storbritannien
7     1992.0    5110.0  1881.0    0.049767           Norge
9     1992.0    5120.0  1780.0    0.047095         Sverige
53    1992.0    5244.0  1763.0    0.046645         Marokko
132   1992.0    5438.0  1349.0    0.035692            Iran
115   1992.0    5390.0  1092.0    0.028892             USA
32    1992.0    5180.0   957.0    0.025320        Tyskland
22    1992.0    5154.0   860.0    0.022754           Polen
14    1992.0    5130.0   769.0    0.020346        Frankrig
18    1992.0    5150.0   706.0    0.018679         Italien
4     1992.0    5106.0   652.0    0.017251          Island
1     1992.0    5103.0   593.0    0.015689        Statsløs
131   1992.0    5436.0   576.0    0.015240            Ir

*Compute the average population of every nation and save it to a .csv file.*

In [52]:
# Average population per country over the whole period covered by the data.
# Each country's summed Total is divided by the number of distinct years in the
# data set (len(Years)), not by the number of years in which the country
# appears, so a year without a row for that country counts as zero.
#
# The sums are computed with one groupby instead of filtering AllData once per
# country and growing an array with np.append, which round-tripped the numbers
# through strings and overwrote Data_Array (the prediction rows) with unrelated
# content. sort=False keeps countries in order of first appearance; rows whose
# COUNTRY is missing are not grouped.
n_years = float(len(Years))  # explicit float: true division on Python 2 as well
country_totals = AllData.groupby('COUNTRY', sort=False)['Total'].sum()

AvCountry = pd.DataFrame(
    {'COUNTRY': country_totals.index.values,
     'AVERAGE': country_totals.values / n_years},
    columns = ['COUNTRY', 'AVERAGE'])
AvCountry['AVERAGE'] = AvCountry['AVERAGE'].astype('float64')

# Largest average first.
AvCountry = AvCountry.sort_values('AVERAGE', ascending = False)
AvCountry.to_csv('AverageCountry.csv', sep=',', encoding='utf-8',index = False)