In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from scipy.stats import spearmanr

In [3]:
test_df = pd.read_csv("recent_tests.csv", sep=';', dtype={'Sesssion': 'str'})

#Resetting types
test_df.astype({'Timestamp': 'str',
                'Client IP Block': 'str',
                'ASN': 'str', 
                'Country': 'str', 
                'NAT': 'str', 
                'Outbound Private Status': 'str',
               'Adjacent Spoof Prefix Length': 'str'}).dtypes

df_received = test_df.loc[test_df['Outbound Routable Status'] == 'received']

print(df_received.shape[0])
df_received.head()

5838


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Session,Timestamp,Client IP Block,ASN,Country,NAT,Outbound Private Status,Outbound Routable Status,Adjacent Spoof Prefix Length
6,966236,2020-08-31 23:48:14,103.132.11.x/24,138528 (EMPIRETECH-AS-AP),"khm (Cambodia, Kingdom of)",yes,received,received,/8
23,966208,2020-08-31 23:21:12,177.190.91.x/24,262977,bra (Brazil),yes,rewritten,received,none
44,966170,2020-08-31 22:18:40,2804:4e08:20xx::/40,268235,bra (Brazil),no,received,received,/16
53,966159,2020-08-31 22:00:03,2804:4e08:20xx::/40,268235,bra (Brazil),no,received,received,/16
55,966158,2020-08-31 21:59:07,2a02:27b0:45xx::/40,9146 (BIHNET),bih (Bosnia and Herzegovina),no,received,received,/16


In [4]:
remedies_df = pd.read_csv("remedies.csv", sep=';')

remedies_df.head()

Unnamed: 0,ASN,Country,IP Block,First Spoofed Timestamp,First Fixed Timestamp
0,268210,bra (Brazil),2804:4da4:xx::/64,2020-09-15 11:04:47,2020-09-15 11:15:36
1,263661,bra (Brazil),2804:1128:40xx::/64,2020-09-08 12:22:42,2020-09-11 13:27:57
2,263661,bra (Brazil),177.221.56.x/32,2020-07-31 22:01:58,2020-09-11 01:17:05
3,269362,bra (Brazil),45.184.241.x/32,2020-09-10 19:44:01,2020-09-10 19:48:38
4,269362,bra (Brazil),45.184.242.x/24,2020-09-04 18:00:18,2020-09-10 14:17:37


In [5]:
spoofable_df = df_received.groupby('Country')
spoofable_df = spoofable_df.count()[['Client IP Block']]
spoofable_df = spoofable_df.rename(columns={"Client IP Block" : "Spoofable Blocks"})

In [6]:
remedies_count_df = remedies_df.groupby('Country')
remedies_count_df = remedies_count_df.count()[['IP Block']]
remedies_count_df = remedies_count_df.rename(columns={"IP Block" : "Blocks Fixed"})

In [7]:
def calc_remediation_rates(countries):
    remediation_rates = []
    
    for country in countries:
        remedies_row = remedies_count_df.loc[country]
        spoofable_row = spoofable_df.loc[country]

        remediation_rate = (remedies_row['Blocks Fixed'] / (remedies_row['Blocks Fixed'] + spoofable_row['Spoofable Blocks'])*100.0)
        remediation_rates.append(remediation_rate)
        
    return remediation_rates

def calc_average(lst):
    return round(sum(lst)/len(lst),2)

In [8]:
unique_countries_remedies = list(remedies_count_df.index)
spoofable_country_list = list(spoofable_df.index)

unique_countries = [x for x in spoofable_country_list if x in unique_countries_remedies]

#print(len(unique_countries))
    
#data = {'Country': unique_countries,
#       'Remediation rate': remediation_rates}

#remediation_rate_df = pd.DataFrame(data, columns = ['Country', 'Remediation rate'])

europe = ['aut (Austria)',
          'bgr (Bulgaria)',
          'bih (Bosnia and Herzegovina)',
          'che (Switzerland)',
          'cze (Czech Republic)',
          'deu (Germany)',
          'esp (Spain)',
          'fra (France)',
          'gbr (United Kingdom)',
          'grc (Greece)',
          'hun (Hungary)',
          'irl (Ireland)',
          'ita (Italy)',
          'ltu (Lithuania)',
          'nld (Netherlands)',
          'nor (Norway)',
          'pol (Poland)',
          'prt (Portugal)',
          'reu (Reunion [French])',
          'rou (Romania)',
          'svn (Slovenia)',
          'swe (Sweden)',
          'ukr (Ukraine)' 
         ]

north_america = ['can (Canada)', 
                 'usa (United States)']

south_america = ['arg (Argentina)',
                 'bol (Bolivia)',
                 'bra (Brazil)',
                 'chl (Chile)',
                 'cri (Costa Rica)',
                 'hti (Haiti)',
                 'mex (Mexico)',
                 'pan (Panama)',
                 'pry (Paraguay)',
                 'ury (Uruguay)']

asia = ['bgd (Bangladesh)',
        'idn (Indonesia)',
        'ind (India)',
        'irn (Iran)',
        'irq (Iraq)',
        'isr (Israel)',
        'jpn (Japan)',
        'kaz (Kazakhstan)',
        'kor (South Korea)',
        'mmr (Myanmar)',
        'npl (Nepal)',
        'pak (Pakistan)',
        'phl (Philippines)',
        'rus (Russian Federation)',
        'tha (Thailand)',
        'tur (Turkey)',
        'twn (Taiwan)']

africa = ['ken (Kenya)', 
          'khm (Cambodia, Kingdom of)',
          'mar (Morocco)',
          'mus (Mauritius)',
          'tza (Tanzania)',
          'zaf (South Africa)']

australia = ['aus (Australia)', 'nzl (New Zealand)']

remediation_averages = []
europe_remediation_rates = calc_remediation_rates(europe)
remediation_averages.append(calc_average(europe_remediation_rates))

na_remediation_rates = calc_remediation_rates(north_america)
remediation_averages.append(calc_average(na_remediation_rates))

sa_remediation_rates = calc_remediation_rates(south_america)
remediation_averages.append(calc_average(sa_remediation_rates))

asia_remediation_rates = calc_remediation_rates(asia)
remediation_averages.append(calc_average(asia_remediation_rates))

africa_remediation_rates = calc_remediation_rates(africa)
remediation_averages.append(calc_average(africa_remediation_rates))

australia_remediation_rates = calc_remediation_rates(australia)
remediation_averages.append(calc_average(australia_remediation_rates))
                            
continents = ['Europe', 'North America', 'South America', 'Asia', 'Africa', 'Australia']

data = {'Continent': continents,
       'Remediation rates': remediation_averages}

continent_df = pd.DataFrame(data, columns=['Continent', 'Remediation rates'])

enc = LabelEncoder()
enc.fit(continent_df['Continent'])
continent_df['continent_encoded'] = enc.transform(continent_df['Continent'])

corr_continent_df = pd.DataFrame.corr(continent_df, method='spearman')
corr_continent_df.head()

Unnamed: 0,Remediation rates,continent_encoded
Remediation rates,1.0,-0.257143
continent_encoded,-0.257143,1.0


In [9]:
X = continent_df['continent_encoded']
Y = continent_df['Remediation rates']

model = sm.OLS(Y, X).fit()
predictions = model.predict(X) 

print_model = model.summary()
print(print_model)

                                 OLS Regression Results                                
Dep. Variable:      Remediation rates   R-squared (uncentered):                   0.537
Model:                            OLS   Adj. R-squared (uncentered):              0.444
Method:                 Least Squares   F-statistic:                              5.790
Date:                Mon, 26 Oct 2020   Prob (F-statistic):                      0.0611
Time:                        11:55:15   Log-Likelihood:                         -26.433
No. Observations:                   6   AIC:                                      54.87
Df Residuals:                       5   BIC:                                      54.66
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                        coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------



In [11]:
#P value on national level considered statistically significant.

unique_countries_remedies = list(remedies_count_df.index)
spoofable_country_list = list(spoofable_df.index)

unique_countries = [x for x in spoofable_country_list if x in unique_countries_remedies]

unique_country_remediation = calc_remediation_rates(unique_countries)

data = {'Countries': unique_countries ,
       'Remediation rates': unique_country_remediation}

country_df = pd.DataFrame(data, columns=['Countries', 'Remediation rates'])

enc = LabelEncoder()
enc.fit(country_df['Countries'])
country_df['country_encoded'] = enc.transform(country_df['Countries'])

corr_country_df = pd.DataFrame.corr(country_df, method='spearman')
corr_country_df.head()

Unnamed: 0,Remediation rates,country_encoded
Remediation rates,1.0,0.038158
country_encoded,0.038158,1.0


In [12]:
X = country_df['country_encoded']
Y = country_df['Remediation rates']

model = sm.OLS(Y, X).fit()
predictions = model.predict(X) 

print_model = model.summary()
print(print_model)

NameError: name 'df' is not defined