In [2]:
import pandas as pd
import numpy as np
from scipy import stats
import sys
print(sys.version)

3.7.1 (default, Dec 10 2018, 22:54:23) [MSC v.1915 64 bit (AMD64)]


In [3]:
def make_features(ne, ua, cutoff_date = '2017-08-01'):
   
    ua['start_date']= pd.to_datetime(ua['start_date'])
    ua['end_date']= pd.to_datetime(ua['end_date'])
    cutoff_date= pd.to_datetime(cutoff_date)
    
    # Step A
    ua=ua.loc[ua['start_date'] <cutoff_date,:]
    ua['end_date']= ua['end_date'].apply(lambda x: cutoff_date - pd.Timedelta(seconds=1) if x > cutoff_date  else x)
    ua['duration_days']=(ua['end_date']-ua['start_date']).apply(lambda x: x.seconds)
    
    # Step B
    mean_duration_days= ua.groupby('ne_id')['duration_days'].apply(lambda x: np.nanmean(x.values)).values  
    mode_product_2 = ua.groupby('ne_id')['product_category_2'].apply(lambda x: stats.mode(x.values).mode[0]).values
    mode_product_3 = ua.groupby('ne_id')['product_category_3'].apply(lambda x: stats.mode(x.values).mode[0]).values
    ne_id= ua.groupby('ne_id')['ne_id'].unique().apply(lambda x: x[0]).values
    d = {'ne_id':ne_id,'mean_duration_days':mean_duration_days, 'mode_product_2': mode_product_2, 'mode_product_3': mode_product_3}
    ua_features = pd.DataFrame(data=d)
    
    ## Step C - add the features to the NE data frame ----
    ne_with_features= pd.merge(ne, ua_features, how='left', on='ne_id')
  
    ## Step D - replace NAs with sensible values ----
    ne_with_features['mean_duration_days']= ne_with_features['mean_duration_days'].fillna(0)
    ne_with_features['mode_product_2']= ne_with_features['mode_product_2'].fillna('no outages found')
    ne_with_features['mode_product_3']= ne_with_features['mode_product_3'].fillna('no outages found')

    return ne_with_features

In [4]:
def make_label(ne,ua,interval_start,interval_end):
    
    interval_start=pd.to_datetime(interval_start)
    interval_end=pd.to_datetime(interval_end)
    
    # Hold only uas in the given interval
    idx1 = (ua['start_date'] >= interval_start)  &  (ua['start_date'] <= interval_end )
    idx2 = (ua['end_date'] >= interval_start)  &  (ua['end_date'] <= interval_end) 
    idx3 = (ua['start_date'] < interval_start)  &  (ua['end_date'] > interval_end ) 
    idx_overall = idx1 | idx2 | idx3 
    unavailability_by_ne= ua[idx_overall]
    
    # Hold only one ua for singe ne 
    ua1=unavailability_by_ne.drop_duplicates( subset='ne_id' )
    ua1['label']=1
    
    ua2= ua1.loc[:,['ne_id','label']]
    ua2=ua1[['ne_id','label']]
    # Create the label for each network id
    ne_with_uas=pd.merge(ne, ua2, how='left', on='ne_id')
    ne_with_uas['label']=ne_with_uas['label'].fillna(0)
    
    return ne_with_uas

In [5]:
def create_train_holdout_test(ua,ne,cutoff_train='2017-06-01',cutoff_holdout='2017-07-01',cutoff_test='2017-08-01'):
    train=make_features(ne, ua, cutoff_date = cutoff_train)
    holdout = make_features(ne, ua, cutoff_holdout)
    test = make_features(ne, ua, cutoff_test)
  
    train = make_label(train, ua, interval_start = cutoff_train, interval_end = cutoff_holdout)
    holdout = make_label(holdout, ua, interval_start = cutoff_holdout, interval_end = cutoff_test)
  
    return train, holdout ,test


In [6]:
ua=pd.read_csv('unavailabilities copy.csv',sep=',')
ne=pd.read_csv('network_elements copy.csv',sep=',')
cutoff_train =   '2017-06-01'
cutoff_holdout = '2017-07-01'
cutoff_test =    '2017-08-01' # this is the end of your data. You should not change this date.



# this can take a bit of time (~5-10 mins on i7, depending on number and types of features)
train,holdout,test = create_train_holdout_test(ua, ne, cutoff_train, cutoff_holdout, cutoff_test)
train.columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


TypeError: '<' not supported between instances of 'float' and 'str'

In [461]:
train.to_csv('train.csv',index=False)
holdout.to_csv('holdout.csv',index=False)
test.to_csv('test.csv',index=False)

In [462]:
train=pd.read_csv('train.csv',sep=',')
holdout=pd.read_csv('holdout.csv',sep=',')
test=pd.read_csv('test.csv',sep=',')

train.columns

Index(['ne_id', 'ne_name', 'site_id', 'controller_id', 'origin_net',
       'technology', 'n_cells', 'city', 'zip_code', 'location_type',
       'urbanity', 'latitude', 'longitude', 'MNC', 'LAC', 'TAC', 'frequency',
       'antenna_type', 'mean_duration_days', 'mode_product_2',
       'mode_product_3', 'label'],
      dtype='object')

In [463]:
columns_drop =['ne_id', 'ne_name', 'site_id', 'controller_id', 'city','zip_code','LAC',
               'TAC','antenna_type','MNC','latitude','longitude']
train = train.drop(columns_drop,axis=1)
holdout = holdout.drop(columns_drop,axis=1)
test = test.drop(columns_drop,axis=1)

In [464]:
train.isna().any()

origin_net            False
technology            False
n_cells                True
location_type          True
urbanity               True
frequency              True
mean_duration_days    False
mode_product_2        False
mode_product_3        False
label                 False
dtype: bool

In [465]:
train.dtypes

origin_net             object
technology             object
n_cells               float64
location_type          object
urbanity               object
frequency             float64
mean_duration_days    float64
mode_product_2         object
mode_product_3         object
label                 float64
dtype: object

In [466]:
mean_n_cells= np.nanmean(train['n_cells'])

mean_frequency= np.nanmean(train['frequency'])


train['n_cells'] =train['n_cells'].fillna(mean_n_cells)
train['location_type'] = train['location_type'].fillna('unknown')
train['urbanity'] = train['location_type'].fillna('unknown')
train['frequency'] = train['frequency'].fillna(mean_frequency)

holdout['n_cells'] =train['n_cells'].fillna(mean_n_cells)
holdout['location_type'] = train['location_type'].fillna('unknown')
holdout['urbanity'] = train['location_type'].fillna('unknown')
holdout['frequency'] = train['frequency'].fillna(mean_frequency)

test['n_cells'] =train['n_cells'].fillna(mean_n_cells)
test['location_type'] = train['location_type'].fillna('unknown')
test['urbanity'] = train['location_type'].fillna('unknown')
test['frequency'] = train['frequency'].fillna(mean_frequency)

In [467]:
for col in train:
    print(train[col].unique())
train.columns

['T' 'E']
['3g' '4g' '2g']
[ 6.       4.77419  9.      12.       3.       8.       4.       2.
 18.       1.      16.       5.      10.     ]
['SITE' 'unknown' 'POLE' 'TOWER' 'BUILDING']
['SITE' 'unknown' 'POLE' 'TOWER' 'BUILDING']
[2100.         1449.17247387  800.         2600.         1800.
  900.        ]
[13321.5 18460.      0.  ... 21300.6 19582.  21072.5]
['Network' 'Others' 'no outages found' 'Switch' 'Multiplex' 'Connectivity'
 'Mobile' '0' 'Other' 'Facilities' 'Endpoint' 'Synchronization']
['Ethernet Microwave' 'IP Support' 'no outages found' 'Node B' 'BS' 'DXX'
 'SDH' 'LTE enode' 'BSC' 'RNC' 'ATM Switch' 'Special-Use-Device' 'RR'
 'DCN-Equipment' 'Cross Connect' 'MUX' '0' 'Mobile' 'Microwave PDH'
 'Mobilfunk Repeater' 'Other' 'PMP Central Station' 'Router' 'Location'
 'Microwave SDH' 'WDM' 'Microwave Link' 'Leased Line' 'Interworking'
 'GPS Clock' 'Master Unit for optical fibre repeaters GSM']
[0. 1.]


Index(['origin_net', 'technology', 'n_cells', 'location_type', 'urbanity',
       'frequency', 'mean_duration_days', 'mode_product_2', 'mode_product_3',
       'label'],
      dtype='object')

In [468]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

encoder_One_Hot= LabelEncoder() #OneHotEncoder()

#encoder_Ordinal_tech= OrdinalEncoder(categories=['2g', '3g', '4g'])

train_origin_net=train['origin_net'].values.reshape(-1, 1)
train_technology=train['technology'].values.reshape(-1, 1)
train_location_type =train['location_type'].values.reshape(-1, 1)
train_urbanity =train['urbanity'].values.reshape(-1, 1)
train_mode_product_2=train['mode_product_2'].values.reshape(-1, 1)
train_mode_product_3 =train['mode_product_3'].values.reshape(-1, 1)

#train['technology']=encoder_Ordinal_tech.fit_transform(train_origin_net)

train['origin_net']= encoder_One_Hot.fit_transform(train_origin_net)
train['location_type'] = encoder_One_Hot.fit_transform(train_location_type)
train['urbanity'] = encoder_One_Hot.fit_transform(train_urbanity)
train['mode_product_2'] = encoder_One_Hot.fit_transform(train_mode_product_2)
train['mode_product_3'] = encoder_One_Hot.fit_transform(train_mode_product_3)
train['technology']= encoder_One_Hot.fit_transform(train_technology)


holdout_origin_net=holdout['origin_net'].values.reshape(-1, 1)
holdout_technology=holdout['technology'].values.reshape(-1, 1)
holdout_location_type =holdout['location_type'].values.reshape(-1, 1)
holdout_urbanity =holdout['urbanity'].values.reshape(-1, 1)
holdout_mode_product_2=holdout['mode_product_2'].values.reshape(-1, 1)
holdout_mode_product_3 =holdout['mode_product_3'].values.reshape(-1, 1)

#train['technology']=encoder_Ordinal_tech.fit_transform(train_origin_net)

holdout['origin_net']= encoder_One_Hot.fit_transform(holdout_origin_net)
holdout['location_type'] = encoder_One_Hot.fit_transform(holdout_location_type)
holdout['urbanity'] = encoder_One_Hot.fit_transform(holdout_urbanity)
holdout['mode_product_2'] = encoder_One_Hot.fit_transform(holdout_mode_product_2)
holdout['mode_product_3'] = encoder_One_Hot.fit_transform(holdout_mode_product_3)
holdout['technology']= encoder_One_Hot.fit_transform(holdout_technology)

  y = column_or_1d(y, warn=True)


In [469]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

y_train= train['label']
x_train= train.drop('label',axis=1)

y_holdout = train['label'] 
x_holdout = holdout.drop('label',axis=1)


x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1, random_state=324)

x_holdout, x_test, y_holdout, y_test = train_test_split(x_holdout, y_holdout, test_size=0.0, random_state=324)


In [470]:
model= DecisionTreeClassifier(max_leaf_nodes=10, random_state=0)
model.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=10,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [471]:
from sklearn.metrics import accuracy_score

predictions=model.predict(x_holdout)
accuracy_score(y_true = y_holdout, y_pred = predictions)


0.8837264190704183