### Reading Data into a Pandas DataFrame for Data-Swimming

In [104]:
#Imports
#====================
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#====================

# Load the training set from the CSV that sits next to the notebook.
DATA_PATH = 'final_train_data.csv'
data = pd.read_csv(DATA_PATH)

# Preview the first rows to sanity-check column names and value ranges.
data.head()

Unnamed: 0.1,Unnamed: 0,Country Name,Country Code,Year,Balance,Inflation,GDP,Exports,Trade
0,0,Senegal,SEN,1983,-304382900.0,9.274213,-5.326393,618000000.0,59.224298
1,1,Togo,TGO,2009,-176669200.0,1.855275,3.510297,903026000.0,76.242542
2,2,Pakistan,PAK,1992,-1876092000.0,10.057085,7.705898,7351000000.0,34.48944
3,3,Nepal,NPL,1988,-271450600.0,11.81531,7.696809,190000000.0,24.978422
4,5,Malta,MLT,2016,726913200.0,1.604548,5.516617,3017299000.0,83.781471


### Finding High-Level Stats 
#### Number of Samples, Samples per Country, etc.

In [74]:
# Headline size of the dataset: total rows and number of distinct country codes.
print("Total rows: ",len(data))
print("Total Countries: ",len(set(data['Country Code'])))

# Rows per country code. value_counts() already returns the counts in
# descending order, and that order is kept as the dict's insertion order.
country_vc = data['Country Code'].value_counts()
# Series.iteritems() was removed in pandas 2.0; .items() is the supported
# spelling. The loop variable is named instead of `_` so IPython's
# last-output variable is not clobbered. Counts are cast to plain int so the
# printed tuples look the same regardless of the numpy scalar repr.
country_to_sample_count = {
    code: int(count) for code, count in country_vc.items()
}

#Countries with Max samples:
print("=============")
print("Samples per Country(Top-10)")
# Stable sort by count, largest first; ties keep their value_counts() order.
sample_count = sorted(
    country_to_sample_count.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
print(sample_count[:10])

Total rows:  4506
Total Countries:  150
Samples per Country(Top-10)
[('ISR', 47), ('ZAF', 45), ('CAN', 43), ('DOM', 39), ('MYS', 39), ('SAU', 39), ('GTM', 38), ('USA', 38), ('HTI', 37), ('GRD', 37)]


### Checking for NaN values in Dataset

In [105]:
# Per-column flag: True when the column holds at least one missing value.
print("Checking for NaN values in Dataset")
missing_by_column = data.isnull().any(axis=0)
missing_by_column

Checking for NaN values in Dataset


Unnamed: 0      False
Country Name    False
Country Code    False
Year            False
Balance         False
Inflation        True
GDP              True
Exports          True
Trade            True
dtype: bool

#### Countries and Years for which Balance is present in the Dataset

In [98]:
# Map each (whitespace-stripped) country code to the list of years that appear
# for it, one entry per row of `data`.
# Only the two columns that are needed are walked, instead of materialising a
# full row Series per record with iterrows(); setdefault replaces the
# try/except-KeyError dance. First-seen order of the codes is unchanged.
country_to_year_series = dict()
for code, year in zip(data['Country Code'], data['Year']):
    country_to_year_series.setdefault(code.strip(), []).append(int(year))

# (code, sorted years) pairs, longest year list first. The sort is stable, so
# countries with the same number of years keep their first-seen order.
per_year_data = sorted(
    ((code, sorted(years)) for code, years in country_to_year_series.items()),
    key=lambda pair: len(pair[1]),
    reverse=True,
)

# The loop variable is named instead of `_`, which IPython reserves for the
# last cell output.
print("Country and Years:")
print("Top-3")
for entry in per_year_data[:3]:
    print(entry,end="\n===========\n")
print("Bottom-3")
for entry in per_year_data[-3:]:
    print(entry,end="\n===========\n")

Country and Years:
Top-3
('ISR', [1960, 1961, 1962, 1964, 1965, 1966, 1968, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1980, 1981, 1982, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1995, 1996, 1997, 1998, 2000, 2001, 2002, 2003, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2015])
('ZAF', [1960, 1961, 1962, 1963, 1964, 1967, 1968, 1970, 1971, 1972, 1973, 1974, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1986, 1987, 1989, 1990, 1992, 1993, 1994, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2004, 2005, 2006, 2007, 2009, 2010, 2012, 2013, 2014, 2016])
('CAN', [1962, 1963, 1964, 1965, 1967, 1968, 1969, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1980, 1981, 1982, 1985, 1987, 1988, 1989, 1991, 1992, 1993, 1994, 1995, 1996, 1998, 2000, 2001, 2002, 2005, 2006, 2007, 2008, 2009, 2011, 2012, 2013, 2014, 2015, 2016])
Bottom-3
('JPN', [1996, 1997, 1998, 1999, 2000, 2002, 2004, 2006, 2007, 2009, 2010, 2011, 2012, 2013, 2014, 2016])
('AZE', [1995, 1996,

### Plotting Balance Vs Year for Top-3 Countries

In [97]:
# NOTE(review): `plotly.plotly` is the online (Chart Studio) API; from plotly 4
# onwards it lives in the separate `chart_studio` package - confirm the
# installed plotly version before re-running this cell.
import plotly.plotly as py
import plotly.graph_objs as go

# Country code -> bar colour for the three countries with the most samples.
top3_colours = {
    'ISR': 'rgb(255,0,0)',
    'ZAF': 'rgb(0,255,0)',
    'CAN': 'rgb(0,0,255)',
}

# Keep only the rows of the selected countries.
pdf = data[data['Country Code'].isin(list(top3_colours))]

# One grouped-bar trace (Balance per Year) per country.
pdata = []
for code, rgb in top3_colours.items():
    country_rows = pdf[pdf['Country Code']==code]
    pdata.append(
        go.Bar(
            x=country_rows['Year'],
            y=country_rows['Balance'],
            name=code,
            marker=dict(color=rgb),
        )
    )
# The individual trace names are kept available, as in the unrolled version.
trace1, trace2, trace3 = pdata

layout = go.Layout(barmode='group')

fig = go.Figure(data=pdata, layout=layout)
# NOTE(review): the other plotting cells upload under the same filename
# 'grouped-bar'; presumably each upload replaces the previous chart - confirm.
py.iplot(fig, filename='grouped-bar')

#### Plotting Balance Vs Year for Bottom-3 Countries

In [99]:
# NOTE(review): `plotly.plotly` is the online (Chart Studio) API; from plotly 4
# onwards it lives in the separate `chart_studio` package - confirm the
# installed plotly version before re-running this cell.
import plotly.plotly as py
import plotly.graph_objs as go

# Country code -> bar colour for the three countries with the fewest samples.
bottom3_colours = {
    'JPN': 'rgb(255,0,0)',
    'AZE': 'rgb(0,255,0)',
    'KAZ': 'rgb(0,0,255)',
}

# Keep only the rows of the selected countries.
pdf = data[data['Country Code'].isin(list(bottom3_colours))]

# One grouped-bar trace (Balance per Year) per country.
pdata = []
for code, rgb in bottom3_colours.items():
    country_rows = pdf[pdf['Country Code']==code]
    pdata.append(
        go.Bar(
            x=country_rows['Year'],
            y=country_rows['Balance'],
            name=code,
            marker=dict(color=rgb),
        )
    )
# The individual trace names are kept available, as in the unrolled version.
trace1, trace2, trace3 = pdata

layout = go.Layout(barmode='group')

fig = go.Figure(data=pdata, layout=layout)
# NOTE(review): the other plotting cells upload under the same filename
# 'grouped-bar'; presumably each upload replaces the previous chart - confirm.
py.iplot(fig, filename='grouped-bar')

In [109]:
# NOTE(review): `plotly.plotly` is the online (Chart Studio) API; from plotly 4
# onwards it lives in the separate `chart_studio` package - confirm the
# installed plotly version before re-running this cell.
import plotly.plotly as py
import plotly.graph_objs as go

# What to plot: which countries, and which columns go on each axis.
targets = ['JPN','AZE','KAZ']
x_axis = "Year"
y_axis = "Exports"
# Bar colours, matched positionally with `targets`.
bar_colours = ['rgb(255,0,0)', 'rgb(0,255,0)', 'rgb(0,0,255)']

# Keep only the rows of the selected countries.
pdf = data[data['Country Code'].isin(targets)]

# One grouped-bar trace (y_axis per x_axis) per target country.
pdata = []
for code, rgb in zip(targets, bar_colours):
    country_rows = pdf[pdf['Country Code']==code]
    pdata.append(
        go.Bar(
            x=country_rows[x_axis],
            y=country_rows[y_axis],
            name=code,
            marker=dict(color=rgb),
        )
    )
# The individual trace names are kept available, as in the unrolled version.
trace1, trace2, trace3 = pdata

layout = go.Layout(barmode='group')

fig = go.Figure(data=pdata, layout=layout)
# NOTE(review): the other plotting cells upload under the same filename
# 'grouped-bar'; presumably each upload replaces the previous chart - confirm.
py.iplot(fig, filename='grouped-bar')

#### Raw Correlation Analysis

In [96]:
# Pairwise correlation between Balance and the other numeric columns, using
# DataFrame.corr() with its default settings.
# The unused `rs = np.random.RandomState(0)` was dropped: nothing in this cell
# draws random numbers, so it had no effect on the result.
# NOTE(review): the saved output of this cell shows an extra `syn1` column that
# is not selected here - the output looks stale; re-run to refresh it.
df = data[['Balance','Year','Inflation','GDP','Exports','Trade']]
corr = df.corr()
# Render the matrix as a colour-graded table (last expression is displayed).
corr.style.background_gradient()

Unnamed: 0,Balance,Year,Inflation,GDP,Exports,Trade,syn1
Balance,1.0,0.0111765,0.000919432,0.0201054,-0.0146729,0.0764567,0.00115275
Year,0.0111765,1.0,-0.0455437,-0.00996952,0.213872,0.066026,0.016557
Inflation,0.000919432,-0.0455437,1.0,-0.0912275,-0.0207551,-0.0246931,-0.592371
GDP,0.0201054,-0.00996952,-0.0912275,1.0,-0.0328561,0.105416,0.137726
Exports,-0.0146729,0.213872,-0.0207551,-0.0328561,1.0,-0.0308244,0.00540507
Trade,0.0764567,0.066026,-0.0246931,0.105416,-0.0308244,1.0,0.0224598
syn1,0.00115275,0.016557,-0.592371,0.137726,0.00540507,0.0224598,1.0


### Conclusions
##### 1. The dataset contains multiple entries per country: one Balance value for each recorded year.
##### 2. The number of samples differs from country to country.
##### 3. Balance has only a very weak positive correlation with Trade, GDP, Year and Inflation (strongest: Trade, ≈ 0.08).
##### 4. Balance has a very weak negative correlation with Exports (≈ -0.01).