# More YouTube questions answered

When reading the pandas documentation, it's important to distinguish between the different kinds of pandas functions. There are basically three types:
- Top-level functions
- DataFrame methods
- Series methods

In [3]:
import pandas as pd

In [6]:
# Read the UFO reports CSV from the URL below into the DataFrame `ufo`,
# which the cells below use for every example.
ufo = pd.read_csv('http://bit.ly/uforeports')

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


#### pandas.top_level_function

In [4]:
# Top-level functions live directly in the pandas namespace, so they are called as pd.<function>().
# The documentation lists them as: pandas.top_level_function
pd.isnull(ufo).head(n=5)  # documented as pandas.isnull(obj); n=5 is head's default

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,False,True,False,False,False
1,False,True,False,False,False
2,False,True,False,False,False
3,False,True,False,False,False
4,False,True,False,False,False


#### pandas.DataFrame.method

In [8]:
# DataFrame methods are called on a DataFrame object (here `ufo`).
# The documentation lists them as: pandas.DataFrame.method
ufo.head(n=5)  # documented as DataFrame.head(n=5); n=5 is the default

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


#### pandas.Series.method

In [12]:
# Series methods are called on a Series object (here the City column of `ufo`).
# The documentation lists them as: pandas.Series.method
ufo['City'].isnull().head()  # documented as Series.isnull()

0    False
1    False
2    False
3    False
4    False
Name: City, dtype: bool

### Sampling DataFrames

In [17]:
# DataFrame.sample() returns a random sample of rows.
# n is the number of rows to return (n=1 when neither n nor frac is given).
# random_state=None is the default: no seed, so each run may draw different rows.
ufo.sample(n=3, random_state=None)

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
7978,Murphy,,,NC,3/30/1995 23:32
8619,Mukilteo,YELLOW,,WA,8/31/1995 21:50
4653,Bartlesville,,,OK,11/29/1981 0:30


In [21]:
# Passing an integer seed as random_state makes the sample reproducible:
# the same seed returns the same rows on every run.
ufo.sample(n=3, random_state=42)  # seed 42 gives the same sample as https://youtu.be/oH3wYKvwpJ8

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
217,Norridgewock,,DISK,ME,9/15/1952 14:00
12282,Ipava,,TRIANGLE,IL,10/1/1998 21:15
17933,Ellinwood,,FIREBALL,KS,11/13/2000 22:00


In [27]:
# frac= samples a fraction of the rows instead of a fixed number of rows.
sampled_75 = ufo.sample(frac=0.75, random_state=99)  # frac=0.75 -> 75% of the rows
sampled_75.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
6250,Sunnyvale,,OTHER,CA,12/16/1989 0:00
8656,Corpus Christi,,,TX,9/13/1995 0:10
2729,Mentor,,DISK,OH,8/8/1974 10:00
7348,Wilson,,LIGHT,WI,6/1/1994 1:00
12637,Lowell,,CIRCLE,MA,11/26/1998 10:00


In [47]:
# Hold out the 25% of rows that the 75% sample does not pick.
train = ufo.sample(frac=0.75, random_state=99)  # the sampled 75% of rows become `train`

# ufo.index.isin(train.index) is True for every ufo row whose index label also appears in train.
# ~ inverts that boolean array, so .loc keeps only the rows that are NOT in train.
in_train = ufo.index.isin(train.index)
test = ufo.loc[~in_train]
test.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00
5,Valley City,,DISK,ND,9/15/1934 15:30
8,Eklutna,,CIGAR,AK,10/15/1936 17:00
11,Waterloo,,FIREBALL,AL,6/1/1939 20:00
13,Keokuk,,OVAL,IA,7/7/1939 2:00


In [55]:
# Two fresh, unseeded samples are drawn only to show the row counts that frac=0.75 and
# frac=0.25 produce, printed next to the sizes of the train/test split.
print(ufo.sample(frac=0.75).shape)
print(train.shape) # train has 75% of the rows
print(ufo.sample(frac=0.25).shape)
print(test.shape) # test has the remaining 25% of the rows

# train and test must partition ufo: together they account for every row.
assert len(train) + len(test) == len(ufo)

(13681, 5)
(13681, 5)
(4560, 5)
(4560, 5)
