## Python vs. R

### Import libraries
*Python*:

In [73]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

*R*:
```r
require(tidyverse)
```

### Read and Write

#### Read zipped files

*Python*:
```python
# Parse the gzipped GFF3 annotation as a headerless, tab-delimited table,
# skipping '#' comment lines and naming the nine columns explicitly.
df = pd.read_csv(
    'Homo_sapiens.GRCh38.85.gff3.gz',
    compression='gzip',
    sep='\t',
    comment='#',
    low_memory=False,
    header=None,
    names=[
        'seqid', 'source', 'type', 'start', 'end',
        'score', 'strand', 'phase', 'attributes',
    ],
)
df.head()
```

*R*:
```r
# Read the gzipped GFF3 annotation from the working directory (the same
# relative path the Python example uses) instead of a user-specific home
# directory: tab-separated, no header row, '#' lines treated as comments.
df <- read.csv("Homo_sapiens.GRCh38.85.gff3.gz",
               header = FALSE,
               sep = "\t",
               comment.char = "#",
               col.names = c('seqid', 'source', 'type', 'start', 'end',
                             'score', 'strand', 'phase', 'attributes'))
head(df)
```

#### Read file
*Python*:

In [74]:
# read_csv handles the comma-separated file; read_table defaults to a tab
# delimiter and is used for the .txt file.
df_h = pd.read_csv('City_Zhvi_AllHomes.csv')
df = pd.read_table('fruit_data_with_colors.txt')

In [75]:
# Preview the first five rows of the fruit table.
df.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [76]:
# Preview the first five rows of df_h; the frame is wide, with month-named
# columns running from 1996-04 to 2016-08.
df_h.head()

Unnamed: 0,RegionID,RegionName,State,Metro,CountyName,SizeRank,1996-04,1996-05,1996-06,1996-07,...,2015-11,2015-12,2016-01,2016-02,2016-03,2016-04,2016-05,2016-06,2016-07,2016-08
0,6181,New York,NY,New York,Queens,1,,,,,...,573600,576200,578400,582200,588000,592200,592500,590200,588000,586400
1,12447,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,2,155000.0,154600.0,154400.0,154200.0,...,558200,560800,562800,565600,569700,574000,577800,580600,583000,585100
2,17426,Chicago,IL,Chicago,Cook,3,109700.0,109400.0,109300.0,109300.0,...,207800,206900,206200,205800,206200,207300,208200,209100,211000,213000
3,13271,Philadelphia,PA,Philadelphia,Philadelphia,4,50000.0,49900.0,49600.0,49400.0,...,122300,121600,121800,123300,125200,126400,127000,127400,128300,129100
4,40326,Phoenix,AZ,Phoenix,Maricopa,5,87200.0,87700.0,88200.0,88400.0,...,183800,185300,186600,188000,189100,190200,191300,192800,194500,195900


### Data querying and sampling

In [77]:
# Dimensions of the fruit table as (n_rows, n_columns).
df.shape

(59, 7)

In [79]:
# Keep rows satisfying both conditions; each comparison must be parenthesised
# because & binds more tightly than < and >.
df.loc[(df['width'] < 6) & (df['height'] > 6)]

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
49,4,lemon,unknown,132,5.8,8.7,0.73
52,4,lemon,unknown,118,5.9,8.0,0.72
56,4,lemon,unknown,116,5.9,8.1,0.73


### Column

#### by iloc:

In [80]:
df_h.iloc[:, [0, 1, 2, 3]].head()  # explicit list of column positions

Unnamed: 0,RegionID,RegionName,State,Metro
0,6181,New York,NY,New York
1,12447,Los Angeles,CA,Los Angeles-Long Beach-Anaheim
2,17426,Chicago,IL,Chicago
3,13271,Philadelphia,PA,Philadelphia
4,40326,Phoenix,AZ,Phoenix


In [81]:
df_h.iloc[:, :3].head()  # positional slice; the end position is excluded

Unnamed: 0,RegionID,RegionName,State
0,6181,New York,NY
1,12447,Los Angeles,CA
2,17426,Chicago,IL
3,13271,Philadelphia,PA
4,40326,Phoenix,AZ


In [82]:
# Boolean mask over the column labels: True only for the '2015-11' column.
df_h.iloc[:, df_h.columns == '2015-11'].head()

Unnamed: 0,2015-11
0,573600
1,558200
2,207800
3,122300
4,183800


In [83]:
# Boolean mask over the column labels: True for every label in the list.
df_h.iloc[
    :, df_h.columns.isin(['RegionName', 'State', '1996-08', '1996-09'])
].head()

Unnamed: 0,RegionName,State,1996-08,1996-09
0,New York,NY,,
1,Los Angeles,CA,154100.0,154300.0
2,Chicago,IL,109100.0,109000.0
3,Philadelphia,PA,49400.0,49300.0
4,Phoenix,AZ,88500.0,88900.0


#### by loc:

In [84]:
# Select columns by an explicit list of labels.
df_h.loc[:, ['RegionName', 'State', '1996-08', '1996-09']].head()

Unnamed: 0,RegionName,State,1996-08,1996-09
0,New York,NY,,
1,Los Angeles,CA,154100.0,154300.0
2,Chicago,IL,109100.0,109000.0
3,Philadelphia,PA,49400.0,49300.0
4,Phoenix,AZ,88500.0,88900.0


In [85]:
# Label slice: unlike positional slices, both endpoints are included.
df_h.loc[:, 'RegionName':'1996-09'].head()

Unnamed: 0,RegionName,State,Metro,CountyName,SizeRank,1996-04,1996-05,1996-06,1996-07,1996-08,1996-09
0,New York,NY,New York,Queens,1,,,,,,
1,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,2,155000.0,154600.0,154400.0,154200.0,154100.0,154300.0
2,Chicago,IL,Chicago,Cook,3,109700.0,109400.0,109300.0,109300.0,109100.0,109000.0
3,Philadelphia,PA,Philadelphia,Philadelphia,4,50000.0,49900.0,49600.0,49400.0,49400.0,49300.0
4,Phoenix,AZ,Phoenix,Maricopa,5,87200.0,87700.0,88200.0,88400.0,88500.0,88900.0


#### by column name:

In [86]:
# Plain [] with a list of labels selects those columns.
df_h[['RegionName', 'State', '1996-08', '1996-09']].head()

Unnamed: 0,RegionName,State,1996-08,1996-09
0,New York,NY,,
1,Los Angeles,CA,154100.0,154300.0
2,Chicago,IL,109100.0,109000.0
3,Philadelphia,PA,49400.0,49300.0
4,Phoenix,AZ,88500.0,88900.0


#### Drop

In [88]:
# Return a copy without the named columns; df_h itself is left unchanged.
df_h.drop(columns=['RegionName', 'State', '1996-08', '1996-09']).head()

Unnamed: 0,RegionID,Metro,CountyName,SizeRank,1996-04,1996-05,1996-06,1996-07,1996-10,1996-11,...,2015-11,2015-12,2016-01,2016-02,2016-03,2016-04,2016-05,2016-06,2016-07,2016-08
0,6181,New York,Queens,1,,,,,,,...,573600,576200,578400,582200,588000,592200,592500,590200,588000,586400
1,12447,Los Angeles-Long Beach-Anaheim,Los Angeles,2,155000.0,154600.0,154400.0,154200.0,154300.0,154200.0,...,558200,560800,562800,565600,569700,574000,577800,580600,583000,585100
2,17426,Chicago,Cook,3,109700.0,109400.0,109300.0,109300.0,109000.0,109600.0,...,207800,206900,206200,205800,206200,207300,208200,209100,211000,213000
3,13271,Philadelphia,Philadelphia,4,50000.0,49900.0,49600.0,49400.0,49300.0,49400.0,...,122300,121600,121800,123300,125200,126400,127000,127400,128300,129100
4,40326,Phoenix,Maricopa,5,87200.0,87700.0,88200.0,88400.0,89400.0,89700.0,...,183800,185300,186600,188000,189100,190200,191300,192800,194500,195900


In [94]:
# Selecting a single column with [] yields a Series.
type(df['fruit_label'])

pandas.core.series.Series

### Row

In [97]:
# Single-column selection yields a Series.
# NOTE(review): this repeats the previous cell and does not access a row; a
# row example such as type(df.iloc[0]) may have been intended here - confirm.
type(df['fruit_label'])

pandas.core.series.Series

### Functions

In [91]:
# Distinct values of one column, returned as a NumPy array.
df.fruit_name.unique()

array(['apple', 'mandarin', 'orange', 'lemon'], dtype=object)

*R*:
```r
# Unique values of fruit_name, returned as a one-column table.
fruits %>% distinct(fruit_name)
```
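Returns: a data.frame containing only the distinct values of `fruit_name` (the pandas `unique()` call above returns a NumPy array instead).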