# Covariance and Correlation

### Getting and Preparing the Data (Part 1) 

In [None]:
import pandas as pd

In [None]:
# Load the raw movie metadata. low_memory=False makes pandas parse the file in
# one pass rather than in chunks, so column dtypes are inferred consistently.
movie = pd.read_csv("movies_metadata.csv", low_memory= False)

In [None]:
movie

In [None]:
movie.info()

In [None]:
# Preview only: parse release_date into datetimes. With errors="coerce",
# values that cannot be parsed become NaT. The result is displayed, not stored.
pd.to_datetime(movie.release_date, errors = "coerce")

In [None]:
# Use the parsed release date as the index (unparseable dates become NaT)
# and drop the original release_date column, which is now redundant.
movie = (
    movie
    .set_index(pd.to_datetime(movie["release_date"], errors="coerce"))
    .drop(columns="release_date")
)

In [None]:
# Order the rows by their release-date index (ascending).
movie = movie.sort_index()

In [None]:
movie

In [None]:
# Work on an independent copy holding only the columns needed for the analysis.
df = movie[["title", "budget", "revenue"]].copy()

In [None]:
df

In [None]:
df.info()

In [None]:
# Convert budget to numbers; entries that cannot be parsed become NaN.
df["budget"] = pd.to_numeric(df["budget"], errors="coerce")

### Getting and Preparing the Data (Part 2)

In [None]:
df

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Rescale budget and revenue to millions (the plots label them "in MUSD").
# The columns are selected by label instead of by position, so the scaling
# cannot silently hit the wrong columns if the column order ever changes.
# Assigning whole columns also lets pandas choose the float result dtype
# rather than writing into the existing column in place.
df[["budget", "revenue"]] = df[["budget", "revenue"]] / 1_000_000

In [None]:
df

In [None]:
# Show the rows whose title is missing.
df[df["title"].isna()]

In [None]:
# Remove every row that has a missing value in any column.
df = df.dropna()

In [None]:
df.info()

In [None]:
# Frequency of each distinct budget value, most common first.
df["budget"].value_counts()

In [None]:
# Frequency of each distinct revenue value, most common first.
df["revenue"].value_counts()

In [None]:
# Keep only movies for which both revenue and budget are strictly positive.
df = df.query("revenue > 0 and budget > 0")

In [None]:
df

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Movies ordered from the largest budget to the smallest.
df.sort_values(by="budget", ascending=False)

In [None]:
# Movies ordered from the largest revenue to the smallest.
df.sort_values(by="revenue", ascending=False)

In [None]:
# Persist the cleaned table. The release_date index is written as a column
# too, which lets the later sections read it back with index_col="release_date".
df.to_csv("bud_vs_rev.csv")

### How to calculate Covariance and Correlation 

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Reload the cleaned data, restoring release_date as a datetime index.
df = pd.read_csv(
    "bud_vs_rev.csv",
    index_col="release_date",
    parse_dates=["release_date"],
)

In [None]:
df

In [None]:
# Keep only the movies released in 2016: a year string used with .loc on the
# datetime index selects every row dated within that year.
df = df.loc["2016"]

In [None]:
df

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Column means; numeric_only=True skips non-numeric columns such as title.
df.mean(numeric_only=True)

In [None]:
# Column variances; numeric_only=True skips non-numeric columns such as title.
df.var(numeric_only=True)

In [None]:
# Pairwise covariance matrix of the numeric columns (budget, revenue).
# numeric_only=True is needed on pandas >= 2.0, where the default became False
# and the text column "title" would otherwise make the call raise.
df.cov(numeric_only=True)

In [None]:
# Covariance between budget and revenue as a single number.
df["budget"].cov(df["revenue"])

In [None]:
# Pairwise correlation matrix of the numeric columns (budget, revenue).
# numeric_only=True is needed on pandas >= 2.0, where the default became False
# and the text column "title" would otherwise make the call raise.
df.corr(numeric_only=True)

In [None]:
# Correlation coefficient between budget and revenue as a single number.
df["budget"].corr(df["revenue"])

In [None]:
# Correlation computed by hand: cov(X, Y) / (std(X) * std(Y)).
(
    df["budget"].cov(df["revenue"])
    / (df["budget"].std() * df["revenue"].std())
)

In [None]:
# Same covariance via NumPy, returned as a 2x2 matrix.
np.cov(df["budget"], df["revenue"])

In [None]:
# Same correlation via NumPy, returned as a 2x2 matrix.
np.corrcoef(df["budget"], df["revenue"])

### Correlation and Scatterplots – visual Interpretation

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Reload the cleaned data, restoring release_date as a datetime index.
df = pd.read_csv(
    "bud_vs_rev.csv",
    index_col="release_date",
    parse_dates=["release_date"],
)

In [None]:
# Keep only the movies released in 2016: a year string used with .loc on the
# datetime index selects every row dated within that year.
df = df.loc["2016"]

In [None]:
df

In [None]:
# Scatter plot of revenue against budget, with labelled axes.
df.plot.scatter(x="budget", y="revenue", figsize=(15, 10), fontsize=15)
plt.xlabel("Budget (in MUSD)", fontsize=13)
plt.ylabel("Revenue (in MUSD)", fontsize=13)
plt.show()

In [None]:
# Enlarge seaborn's fonts, then plot revenue against budget with lmplot,
# which draws the scatter together with a fitted regression line.
sns.set(font_scale=1.5)
sns.lmplot(data = df, x = "budget", y = "revenue", height = 8) # lmplot is used here instead of jointplot
plt.show()