# Netflix IMDb Scores Visualization
## Step 1: Set up an environment

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from textblob import TextBlob

## Step 2: Import, understand, and clean the data

In [None]:
# Load the Netflix titles dataset from the Kaggle input directory
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/netflix-imdb-scores/Netflix TV Shows and Movies.csv'
)

# Preview the first five rows
df.head(n=5)

In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts,
# dtypes and memory usage. Useful for spotting missing values early.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns; kept as the last expression so the notebook displays it.
df.describe()

In [None]:
# Remove identifier columns that carry no analytical value
df = df.drop(['index', 'id', 'imdb_id'], axis=1)

# Preview the trimmed DataFrame
df.head(n=5)

## Step 3: Data visualization

### General Trends and Distribution:

#### 1. Distribution of TV Shows and Movies

In [None]:
# Count of titles per type (movie vs. show), one colour per category
type_ax = sns.countplot(data=df, x='type', hue='type', palette='Set2')
type_ax.set_title('Distribution of TV Shows and Movies')
plt.show()

#### 2. Distribution of IMDb Scores for TV Shows and Movies

In [None]:
# Draw on a dedicated full-size figure. Selecting subplot (1, 2, 2) would place
# the chart in the right-hand cell of a 1x2 grid whose left cell is never
# drawn, leaving half of the canvas empty.
plt.figure(figsize=(8, 6))

# Violin plot of the IMDb score distribution for each title type. `hue`
# duplicates `x` only to apply the palette, so the legend is suppressed.
sns.violinplot(x='type', y='imdb_score', hue='type', legend=False, data=df, palette='viridis')
plt.title('Distribution of IMDb Scores for Movies and TV Shows')
plt.xlabel('Type')
plt.ylabel('IMDb Score')

plt.tight_layout()
plt.show()

#### 3. Age Certification Distribution

In [None]:
# Number of titles per age certification, split by title type
plt.figure(figsize=(10, 6))
cert_ax = sns.countplot(data=df, x='age_certification', hue='type', palette='muted')
cert_ax.set_title('Age Certification Distribution by type')
plt.tight_layout()
plt.show()

#### 4. Release Year Distribution

In [None]:
# Histogram (20 bins) of release years with a KDE curve overlaid
plt.figure(figsize=(12, 6))
year_ax = sns.histplot(
    data=df,
    x='release_year',
    bins=20,
    kde=True,
    color='lightgreen',
    edgecolor='black',
)
year_ax.set_title('Distribution of Release Years')
plt.show()

#### 5. IMDB Score Distribution

In [None]:
# Histogram (20 bins) of IMDb scores with a KDE curve overlaid
score_ax = sns.histplot(
    data=df,
    x='imdb_score',
    bins=20,
    kde=True,
    color='skyblue',
    edgecolor='black',
)
score_ax.set_title('IMDB Score Distribution')

# Slant the tick labels so they do not overlap
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

#### 6. Top-10 Rated Titles

In [None]:
# The ten rows with the highest IMDb score
top_rated = df.nlargest(10, 'imdb_score')

plt.figure(figsize=(10, 5))
# Assign `hue` to the same variable as `y` and hide the redundant legend:
# passing `palette` without `hue` is deprecated in seaborn 0.13+, and this
# mirrors the hue/legend pattern already used for the violin plot.
sns.barplot(
    x='imdb_score',
    y='title',
    hue='title',
    legend=False,
    data=top_rated,
    palette='mako',
)
plt.title('Top-10 Rated Titles')
plt.show()

#### 7. IMDB Votes vs IMDB Score (with Release Year)

In [None]:
# Scatter of vote count against score, coloured by release year.
# No fixed `color` is passed: once `hue` is mapped, the point colours come
# from the hue mapping, so a single colour argument would only be misleading.
sns.scatterplot(x='imdb_votes', y='imdb_score', hue='release_year', data=df)
plt.title('IMDB Votes vs IMDB Score')
plt.show()

#### 8. IMDb Scores and Votes Across Release Years

In [None]:
# Score per release year; colour encodes the title type and marker size
# encodes the number of IMDb votes
years_ax = sns.scatterplot(
    data=df,
    x='release_year',
    y='imdb_score',
    hue='type',
    size='imdb_votes',
    palette='Set2',
)
years_ax.set_title('IMDb Scores and Votes Across Release Years')
plt.show()

### Correlation and Pairwise Analysis:

#### 1. Pair Plot of Numeric Columns:

In [None]:
# `sns.pairplot` is a figure-level function that creates its own figure, so no
# `plt.figure(...)` call is made beforehand (it would only produce an extra,
# empty figure in the notebook output).
pair_grid = sns.pairplot(
    df[['release_year', 'runtime', 'imdb_score', 'imdb_votes']],
    diag_kind='kde',
)

# Attach the title to the grid's own figure; y > 1 lifts it above the top row
pair_grid.figure.suptitle('Pair Plot of Numeric Columns', y=1.02)
plt.show()

#### 2. Correlation Heatmap for Numeric Columns

In [None]:
# Keep only the float64/int64 columns and compute their pairwise correlations
numeric_df = df.select_dtypes(include=['float64', 'int64'])
numeric_columns = numeric_df.columns
correlation_matrix_numeric = numeric_df.corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(
    correlation_matrix_numeric,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
)
heatmap_ax.set_title('Correlation Heatmap for Numeric Columns')
plt.show()

### Text Analysis:

#### 1. Word Cloud of Most Popular Words in Description

In [None]:
# Combine all descriptions into a single string. Missing descriptions are
# dropped first; casting NaN to str would inject the literal word "nan" into
# the text and make it show up in the cloud.
text = ' '.join(df['description'].dropna().astype(str))

# Build the word cloud from the combined text
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # the image has no meaningful axes
plt.title('Word Cloud of Most Popular Words in Description')
plt.show()

#### 2. Sentiment Distribution in Descriptions

In [None]:
# Polarity score of each description as computed by TextBlob.
# `na_action='ignore'` leaves missing descriptions as NaN instead of scoring
# the string "nan", which would add spurious neutral entries to the histogram.
df['sentiment'] = df['description'].map(
    lambda x: TextBlob(str(x)).sentiment.polarity,
    na_action='ignore',
)

plt.figure(figsize=(10, 6))
sns.histplot(df['sentiment'], bins=20, kde=True, color='green', edgecolor='black')
plt.title('Sentiment Distribution in Descriptions')
plt.show()

### Thanks for your attention