## Python coding style and numpy basics

In [11]:
import numpy as np

In [12]:
# The Zen of Python
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


In [13]:
abc = np.arange(12).reshape(3,4)
abc

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [14]:
print(abc.sum())
print("  ")
print(abc.sum(axis=0))
print("  ")
print(abc.sum(axis=1))

66
  
[12 15 18 21]
  
[ 6 22 38]


In [15]:
?np.random.rand

## Use np.Vectorize to return functions

In [16]:
@np.vectorize
def noneg(n):
    if not np.isnan(n) and n <0:
        return n.__class__(-1)
    else:
        return n

In [17]:
v = np.array([-1,3,5,-33,2])
noneg(-3)
noneg(v)

array([-1,  3,  5, -1,  2])

In [18]:
v1 = np.array([-1, 2, np.nan])

In [19]:
noneg(v1)

array([ -1.,   2.,  nan])

In [20]:
np.isnan(np.nan)

True

In [21]:
np.isnan(np.nan)

True

## Use Pandas Functions and usages
* Pandas read files
* pandas operations

The formula for logloss is $1/1+exp(-w.t*x)$

In [22]:
from os import path
fname = path.expanduser('~/Desktop_Abhik/Coursera_Advanced_ML_courses/Course_2_Kaggle_competiitions/kaggle_challenge_data/sales_train.csv')

In [23]:
!ls -lh "$fname"

-rw-r--r--@ 1 abhikbanerjee  staff    90M Feb 26 22:01 /Users/abhikbanerjee/Desktop_Abhik/Coursera_Advanced_ML_courses/Course_2_Kaggle_competiitions/kaggle_challenge_data/sales_train.csv


In [24]:
# get the path size of the filename
path.getsize(fname)/(1<<10)

92386.587890625

In [25]:
! head "$fname"

date,date_block_num,shop_id,item_id,item_price,item_cnt_day
02.01.2013,0,59,22154,999.0,1.0
03.01.2013,0,25,2552,899.0,1.0
05.01.2013,0,25,2552,899.0,-1.0
06.01.2013,0,25,2554,1709.05,1.0
15.01.2013,0,25,2555,1099.0,1.0
10.01.2013,0,25,2564,349.0,1.0
02.01.2013,0,25,2565,549.0,1.0
04.01.2013,0,25,2572,239.0,1.0
11.01.2013,0,25,2572,299.0,1.0


In [26]:
# get the first 10 lines of the file
with open(fname) as fp:
    for i,line in enumerate(fp):
        if i>10:
            break
        print(line[:-1]) # remove the new line at the end

date,date_block_num,shop_id,item_id,item_price,item_cnt_day
02.01.2013,0,59,22154,999.0,1.0
03.01.2013,0,25,2552,899.0,1.0
05.01.2013,0,25,2552,899.0,-1.0
06.01.2013,0,25,2554,1709.05,1.0
15.01.2013,0,25,2555,1099.0,1.0
10.01.2013,0,25,2564,349.0,1.0
02.01.2013,0,25,2565,549.0,1.0
04.01.2013,0,25,2572,239.0,1.0
11.01.2013,0,25,2572,299.0,1.0
03.01.2013,0,25,2573,299.0,3.0


In [27]:
!wc -l "$fname"

 2935850 /Users/abhikbanerjee/Desktop_Abhik/Coursera_Advanced_ML_courses/Course_2_Kaggle_competiitions/kaggle_challenge_data/sales_train.csv


In [28]:
with open(fname) as fp:
    print(sum(1 for line in fp))

2935850


In [29]:
import pandas as pd

df = pd.read_csv(fname, parse_dates=['date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
date              datetime64[ns]
date_block_num    int64
shop_id           int64
item_id           int64
item_price        float64
item_cnt_day      float64
dtypes: datetime64[ns](1), float64(2), int64(3)
memory usage: 134.4 MB


In [30]:
df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-02-01,0,59,22154,999.0,1.0
1,2013-03-01,0,25,2552,899.0,1.0
2,2013-05-01,0,25,2552,899.0,-1.0
3,2013-06-01,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


In [31]:
df.dtypes

date              datetime64[ns]
date_block_num             int64
shop_id                    int64
item_id                    int64
item_price               float64
item_cnt_day             float64
dtype: object

In [32]:
df.iloc[0:2]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-02-01,0,59,22154,999.0,1.0
1,2013-03-01,0,25,2552,899.0,1.0


In [33]:
df.index

RangeIndex(start=0, stop=2935849, step=1)

### Use PyTz for timezone conversions 

In [34]:
df.index = df['date']

In [35]:
df.index

DatetimeIndex(['2013-02-01', '2013-03-01', '2013-05-01', '2013-06-01',
               '2013-01-15', '2013-10-01', '2013-02-01', '2013-04-01',
               '2013-11-01', '2013-03-01',
               ...
               '2015-10-24', '2015-10-31', '2015-11-10', '2015-10-10',
               '2015-09-10', '2015-10-10', '2015-09-10', '2015-10-14',
               '2015-10-22', '2015-03-10'],
              dtype='datetime64[ns]', name='date', length=2935849, freq=None)

In [36]:
import pytz

In [37]:
df['date'].tz_localize(pytz.UTC).tz_convert(pytz.timezone('Asia/Jerusalem')).head()

date
2013-02-01 02:00:00+02:00   2013-02-01
2013-03-01 02:00:00+02:00   2013-03-01
2013-05-01 03:00:00+03:00   2013-05-01
2013-06-01 03:00:00+03:00   2013-06-01
2013-01-15 02:00:00+02:00   2013-01-15
Name: date, dtype: datetime64[ns]

In [38]:
import sys
sys.path

['',
 '/Users/abhikbanerjee/anaconda3/lib/python36.zip',
 '/Users/abhikbanerjee/anaconda3/lib/python3.6',
 '/Users/abhikbanerjee/anaconda3/lib/python3.6/lib-dynload',
 '/Users/abhikbanerjee/anaconda3/lib/python3.6/site-packages',
 '/Users/abhikbanerjee/anaconda3/lib/python3.6/site-packages/aeosa',
 '/Users/abhikbanerjee/anaconda3/lib/python3.6/site-packages/torchvision-0.2.0-py3.6.egg',
 '/Users/abhikbanerjee/anaconda3/lib/python3.6/site-packages/IPython/extensions',
 '/Users/abhikbanerjee/.ipython']

In [39]:
!pwd

/Users/abhikbanerjee/Open_Source_Ml_Medium


In [1]:
import pandas as pd
sci_df = pd.read_csv("./science_federal_giving.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
sci_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881793 entries, 0 to 881792
Data columns (total 31 columns):
cmte_nm                 470203 non-null object
cmte_id                 470203 non-null object
cmte_tp                 470203 non-null object
cmte_pty                469997 non-null object
cand_name               383801 non-null object
cand_pty_affiliation    383796 non-null object
cand_office_st          383801 non-null object
cand_office             383801 non-null object
cand_office_district    383763 non-null float64
cand_status             383717 non-null object
rpt_tp                  470203 non-null object
transaction_pgi         430476 non-null object
transaction_tp          470203 non-null object
entity_tp               470157 non-null object
cleaned_name            470177 non-null object
city                    470125 non-null object
state                   469543 non-null object
zip_code                469642 non-null object
employer                467864 non-null ob

In [42]:
print(id(sci_df)) # gives the memory address
print(type(sci_df))  # gives the type

4937153280
<class 'pandas.core.frame.DataFrame'>


In [43]:
print(id(df)) # gives the memory address
print(type(df))  # gives the type

5995642720
<class 'pandas.core.frame.DataFrame'>


In [46]:
print(df.__doc__)
print(dir(df))

 Two-dimensional size-mutable, potentially heterogeneous tabular data
    structure with labeled axes (rows and columns). Arithmetic operations
    align on both row and column labels. Can be thought of as a dict-like
    container for Series objects. The primary pandas data structure

    Parameters
    ----------
    data : numpy ndarray (structured or homogeneous), dict, or DataFrame
        Dict can contain Series, arrays, constants, or list-like objects
    index : Index or array-like
        Index to use for resulting frame. Will default to np.arange(n) if
        no indexing information part of input data and no index provided
    columns : Index or array-like
        Column labels to use for resulting frame. Will default to
        np.arange(n) if no column labels are provided
    dtype : dtype, default None
        Data type to force, otherwise infer
    copy : boolean, default False
        Copy data from inputs. Only affects DataFrame / 2d ndarray input

    Examples
    ---

In [47]:
globals()

{'In': ['',
  'import pandas as pd\nsci_df = pd.read_csv("./science_federal_giving.csv")',
  'sci_df.head()',
  'sci_df.transpose().head()',
  'sci_df.head()',
  'sci_df.columns',
  'sci_df.info()',
  'sci_df.summary()',
  'sci_df.summary',
  'sci_df.dtypes',
  "df['date'].tz_localize(pytz.UTC).tz_convert(pytz.timezone('Asia/Jerusalem')).head()",
  'import numpy as np',
  '# The Zen of Python\nimport this',
  'abc = np.arange(12).reshape(3,4)\nabc',
  'print(abc.sum())\nprint("  ")\nprint(abc.sum(axis=0))\nprint("  ")\nprint(abc.sum(axis=1))',
  "get_ipython().magic('pinfo np.random.rand')",
  '@np.vectorize\ndef noneg(n):\n    if not np.isnan(n) and n <0:\n        return n.__class__(-1)\n    else:\n        return n',
  'v = np.array([-1,3,5,-33,2])\nnoneg(-3)\nnoneg(v)',
  'v1 = np.array([-1, 2, np.nan])',
  'noneg(v1)',
  'np.isnan(np.nan)',
  'np.isnan(np.nan)',
  "from os import path\nfname = path.expanduser('~/Desktop_Abhik/Coursera_Advanced_ML_courses/Course_2_Kaggle_competiition

In [56]:
with open("./science_federal_giving.csv","r") as file:
    file_data = file.read()
    print(file_data.count('.'))

950294


In [4]:
import pandas as pd
df = pd.read_csv("/Users/abhikbanerjee/auto-ml/joined_data_flatenner_20180321")
df.head(n=50)

Unnamed: 0,[riid],hash_email,email_permission,email_deliver,email_domain,email_isp,mob_permission,mob_deliverability,city,state,postal_code,country,created_date,modified_date,acc_id,description,event_dt,launch_status,launch_type,new_event_type
0,65133942,f394cd28c5be8c22bd592741c8c6209abb43d242924e8...,I,D,gmail.com,Gmail,I,D,,,,,2018-03-14 14:26:07,2018-03-14 14:26:07,9653.0,AUS - Join Day to Accept,,L,S,0.0
1,65133942,f394cd28c5be8c22bd592741c8c6209abb43d242924e8...,I,D,gmail.com,Gmail,I,D,,,,,2018-03-14 14:26:07,2018-03-14 14:26:07,9653.0,AUS - Join Day to Accept,,B,S,0.0
2,65133942,f394cd28c5be8c22bd592741c8c6209abb43d242924e8...,I,D,gmail.com,Gmail,I,D,,,,,2018-03-14 14:26:07,2018-03-14 14:26:07,9653.0,AUS - Join Day to Accept,,C,S,0.0
3,62495502,fb670cc67c6aa762032f50fcba7d92606888299401eef...,I,D,gmail.com,Gmail,O,D,,,,,2018-02-14 14:38:01,2018-03-14 14:19:14,9653.0,AUS - Join Day to Accept,,L,S,0.0
4,62495502,fb670cc67c6aa762032f50fcba7d92606888299401eef...,I,D,gmail.com,Gmail,O,D,,,,,2018-02-14 14:38:01,2018-03-14 14:19:14,9653.0,AUS - Join Day to Accept,,B,S,0.0
5,62495502,fb670cc67c6aa762032f50fcba7d92606888299401eef...,I,D,gmail.com,Gmail,O,D,,,,,2018-02-14 14:38:01,2018-03-14 14:19:14,9653.0,AUS - Join Day to Accept,,C,S,0.0
6,57869862,1aba7f388de0b2e2b55a8dcdcba5897ff26395a93dd01...,I,D,gmail.com,Gmail,I,D,,,,,2018-01-17 18:35:56,2018-03-14 14:56:15,9653.0,AUS - Join Day to Accept,,L,S,0.0
7,57869862,1aba7f388de0b2e2b55a8dcdcba5897ff26395a93dd01...,I,D,gmail.com,Gmail,I,D,,,,,2018-01-17 18:35:56,2018-03-14 14:56:15,9653.0,AUS - Join Day to Accept,,B,S,0.0
8,57869862,1aba7f388de0b2e2b55a8dcdcba5897ff26395a93dd01...,I,D,gmail.com,Gmail,I,D,,,,,2018-01-17 18:35:56,2018-03-14 14:56:15,9653.0,AUS - Join Day to Accept,,C,S,0.0
9,30875862,4bfc2f52a5845d241d3688c8415e1c6645bb1a23bd2a5...,I,D,gmail.com,Gmail,I,D,,,,,2017-04-17 03:35:00,2018-03-14 12:48:57,9653.0,,,B,S,0.0
