In [None]:
import pandas as pd
import matplotlib.pyplot

### Creating a DataFrame object:

In [None]:
# A Series is a one-dimensional labelled array; with no explicit index
# the labels default to 0..n-1.
book_titles = [
    "War and Peace I",
    "War and Peace II",
    "War and Peace III",
    "War and Peace IV",
]
book_series = pd.Series(data=book_titles)
book_series

In [None]:
# Page counts, one entry per value, labelled 0..3 by default.
page_counts = [420, 350, 400, 380]
pages_series = pd.Series(data=page_counts)
pages_series

In [None]:
# A DataFrame is a two-dimensional table; each dict key becomes a column
# name and each Series supplies that column's values (aligned on index).
book_columns = {
    'name of the book': book_series,
    'number of pages': pages_series,
}
df_books = pd.DataFrame(data=book_columns)
df_books

In [None]:
# import from a file:
# read_csv parses the CSV at this relative path into a new DataFrame
df_movies = pd.read_csv('data/Walt_disney_movie_dataset.csv')
# a bare last expression is rendered as the cell's output
df_movies

In [None]:
# export into csv:
# index=False leaves the row index out of the written file
df_movies.to_csv('data/movies_edited.csv', index=False)

### Getting general info about df:

In [None]:
# data type of every column, as a Series indexed by column name
df_movies.dtypes

In [None]:
# column labels of the frame
df_movies.columns

In [None]:
# row labels of the frame
df_movies.index

In [None]:
# prints a concise summary: index, per-column non-null counts and dtypes, memory usage
df_movies.info()

In [None]:
# len() of a DataFrame is its number of rows
len(df_movies)

In [None]:
# first 4 rows
df_movies.head(4)

In [None]:
# last 4 rows
df_movies.tail(4)

In [None]:
# descriptive statistics (count, mean, std, min, quartiles, max);
# by default only the numeric columns of a mixed-dtype frame are summarised
df_movies.describe()

In [None]:
# Column-wise mean of the numeric columns only.
# numeric_only=True is required on pandas >= 2.0: the default there is False,
# so a frame that also holds text columns (this one presumably does, e.g.
# "title" and "Language" selected further down) raises TypeError instead of
# silently skipping them as older versions did.
df_movies.mean(numeric_only=True)

In [None]:
# total of a single column: select the column (a Series), then reduce it
df_movies["Running time (int)"].sum()

In [None]:
# Reductions work on a standalone Series too: print the mean, then the sum,
# each on its own line.
budget_series = pd.Series(data=[1000, 3000, 10000, 2500])
print(budget_series.mean(), budget_series.sum(), sep="\n")

### Viewing and selecting data:

#### Selecting rows:

In [None]:
# .iloc[] and .loc[] indexers

# .iloc[] selects by integer position (0-based), regardless of the index labels

# .loc[] selects by index label;
#  it returns several rows when multiple rows share the same label

# both work with slicing (.iloc excludes the end position, .loc includes the end label)

In [None]:
# Five pets with the default 0..4 index, so position and label coincide.
pet_names = ["cat", "dog", "bird", "fish", "cow"]
pets = pd.Series(data=pet_names)
pets

In [None]:
# positional lookup: the element at position 2 (third element, 0-based)
pets.iloc[2]

In [None]:
# label lookup: the element whose index label is 2
# (same element as .iloc[2] while the index is the default 0..n-1)
pets.loc[2]

In [None]:
# Same pets, now with explicit labels; label 10 is deliberately repeated so
# that label-based and position-based lookups give different results.
pet_labels = [10, 20, 10, 40, 50]
pet_names = ["cat", "dog", "bird", "fish", "cow"]
pets = pd.Series(data=pet_names, index=pet_labels)
pets

In [None]:
# position 2 is still the third element, whatever its label is
pets.iloc[2]

In [None]:
# label 10 appears twice in the index, so this returns a Series with both matches
pets.loc[10]

In [None]:
# positional slice: the first two elements (end position excluded)
pets.iloc[:2]

In [None]:
# positional slice: everything from position 2 to the end
pets.iloc[2:]

In [None]:
# on a DataFrame a positional slice selects rows: here the first five
df_movies.iloc[:5]

#### Selecting columns:

In [None]:
# bracket selection with a column name returns that column as a Series;
# brackets are required here because the name contains spaces and parentheses
df_movies["Running time (int)"]

In [None]:
# select the "title" column by name
df_movies["title"]

In [None]:
# attribute access selects the same column; it only works when the column
# name is a valid Python identifier that does not clash with a DataFrame attribute
df_movies.title

In [None]:
# select the "Language" column by name (column labels are case-sensitive)
df_movies["Language"]

In [None]:
# the running-time column again, shown right before it is used for filtering
df_movies["Running time (int)"]

#### Filtering data in a column:

In [None]:
# the comparison yields a boolean Series; indexing the frame with it keeps
# only the rows where the value is True (running time above 100)
df_movies[df_movies["Running time (int)"] > 100]

In [None]:
# number of rows that pass the filter (running time above 100)
len(df_movies[df_movies["Running time (int)"] > 100])

In [None]:
# same count with a stricter threshold (running time above 120)
len(df_movies[df_movies["Running time (int)"] > 120])

### Comparing columns:

In [None]:
# Load the car-sales table used in the remaining sections and display it.
df_cars = pd.read_csv(filepath_or_buffer='data/car-sales.csv')
df_cars

In [None]:
# Frequency table: one row per make, one column per door count,
# each cell counting the cars with that combination.
pd.crosstab(index=df_cars["Make"], columns=df_cars["Doors"])

In [None]:
# Mean of every numeric column per make.
# numeric_only=True is required on pandas >= 2.0, where the groupby mean no
# longer silently drops text columns and raises TypeError instead. At this
# point "Price" is still stored as text (it is converted further down), and
# "Colour" is presumably text as well.
df_cars.groupby(["Make"]).mean(numeric_only=True)

In [None]:
# Mean of every numeric column per (make, colour) pair; the result carries a
# two-level row index.
# numeric_only=True is required on pandas >= 2.0, where text columns that are
# not grouping keys (here "Price", still stored as text at this point) make
# the default call raise TypeError.
df_cars.groupby(["Make", "Colour"]).mean(numeric_only=True)

### Simple and quick visualisation:

In [None]:
# IPython magic: render matplotlib figures directly below the cell that creates them
%matplotlib inline

In [None]:
# Series.plot() draws a line plot of the values against the index by default
df_cars["Odometer (KM)"].plot()

In [None]:
# histogram: distribution of the odometer readings
df_cars["Odometer (KM)"].hist()

### Converting price to int:

In [None]:
# inspect the raw Price column before converting it
df_cars["Price"]

In [None]:
# the prices were read as text, so numeric operations and plots are not possible yet
df_cars["Price"].dtype    # dtype('O') stands for object type, not int

In [None]:
# Convert the text prices to numbers: strip "$", "," and "." so only digits
# remain, parse as int (the amount in hundredths), then divide by 100.
#
# regex=True is passed explicitly because on pandas >= 2.0 str.replace treats
# the pattern as a literal string by default; the character class would then
# never match and astype(int) would fail on the untouched text.
# The pattern is a raw string and needs no backslashes: "$", "," and "." are
# literal inside a character class.
#
# The dtype guard makes the cell safe to re-run: once Price is numeric the
# .str accessor is unavailable and the conversion is already done.
if not pd.api.types.is_numeric_dtype(df_cars["Price"]):
    df_cars["Price"] = (
        df_cars["Price"].str.replace(r'[$,.]', '', regex=True).astype(int) / 100
    )

In [None]:
# with Price converted to numbers, its distribution can be plotted
df_cars["Price"].hist()

### Data manipulation:

#### String methods:

In [None]:
# the column the string methods below operate on
df_cars["Make"]    # equivalent attribute form: df_cars.Make

In [None]:
# .str exposes vectorised string methods; the call returns a new Series
df_cars["Make"].str.lower()    # doesn't change the original df in place

In [None]:
# upper-cased copy of the column; again a new Series is returned
df_cars["Make"].str.upper()    # doesn't change the original df in place

In [None]:
# the frame is unchanged by the two calls above: a column has to be
# reassigned for the change to appear in the original df
df_cars

In [None]:
# assigning the result back to the column stores the lower-cased values in df_cars
df_cars["Make"] = df_cars["Make"].str.lower()
df_cars

### Handling missing data:

#### Replacing NaN with some values:

In [None]:
# load the variant of the car-sales data that contains missing values
df_cars_missing = pd.read_csv('data/car-sales-missing-data.csv')
df_cars_missing

In [None]:
# Replace the missing odometer readings with the column mean (mean() skips
# NaN by default) and store the filled column back in the frame.
odometer_mean = df_cars_missing["Odometer"].mean()
df_cars_missing["Odometer"] = df_cars_missing["Odometer"].fillna(odometer_mean)
df_cars_missing

#### Dropping NaN from df:

In [None]:
# reload the file to start again from the data with missing values
df_cars_missing = pd.read_csv('data/car-sales-missing-data.csv')
df_cars_missing

In [None]:
# dropna() returns a new frame without the rows that contain any missing value
df_cars_dropped = df_cars_missing.dropna()
df_cars_dropped

In [None]:
# the source frame still has its missing values: dropna() did not modify it
df_cars_missing

### Changing df in place with the `inplace` parameter:

In [None]:
# reload the file once more so the in-place examples start from the data with missing values
df_cars_missing = pd.read_csv('data/car-sales-missing-data.csv')
df_cars_missing 

In [None]:
# Fill the missing odometer readings with the column mean, modifying
# df_cars_missing itself.
# The in-place call is made on the DataFrame with a {column: value} dict.
# Calling fillna(..., inplace=True) on the extracted column
# (df_cars_missing.Odometer.fillna(...)) is a chained operation on an
# intermediate object: pandas 2.x warns about it, and under Copy-on-Write the
# frame is left unchanged.
odometer_mean = df_cars_missing["Odometer"].mean()
df_cars_missing.fillna({"Odometer": odometer_mean}, inplace=True)    # by default inplace=False
df_cars_missing

In [None]:
# with inplace=True the rows that still contain missing values are removed
# from df_cars_missing itself and the call returns None
df_cars_missing.dropna(inplace=True)
df_cars_missing