In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from collections import defaultdict
from scipy.stats import skew 
from tqdm import tqdm # Visualization of loop progress
import matplotlib.pyplot as plt

# EDA Purposes:
- How many documents/sentences/tokens are there in your input corpus? 
- What are the length distributions of documents and sentences? Any outliers?
- What is the distribution of all tokens? How many "rare" tokens (e.g., < 5 times)? 
- Is any pre-processing required (e.g., collapsing consecutive whitespace or removing "weird" characters)?
- Run AutoPhrase, and then plot the quality score distribution of single-word and multi-word phrases separately. Compare and discuss their differences.

## How many documents/sentences/tokens are there in the input corpus? 

### Counts of documents/sentences/tokens

In [None]:
# Display the precomputed count table (presumably the document/sentence/token
# counts named in the section heading - confirm against the CSV).
# The first CSV column is used as the row index.
count_stats_path = '../data/eda_files/count_stats.csv'
pd.read_csv(count_stats_path, index_col=0)

## What are the length distributions of documents and sentences? Any outliers?
- **Length distribution of documents**

In [None]:
# Load the document table; later cells read its 'Document' column.
doc_path = '../data/eda_files/all_doc.csv'
doc_df = pd.read_csv(doc_path)

In [None]:
# Histogram with a KDE overlay of document lengths, where length is the
# number of whitespace-separated tokens in each 'Document' entry.
# `sns.distplot` is deprecated; `sns.histplot(kde=True, stat="density")` is
# its documented replacement and keeps the density-scaled look.
fig, ax = plt.subplots(figsize=(15, 5))
doc_length = [len(doc.split()) for doc in doc_df['Document']]  # also used by the box-plot cell
sns.histplot(doc_length, kde=True, stat="density", ax=ax)
ax.set(
    title="Document length distribution",
    xlabel="Tokens per document",
    ylabel="Density",
);

In [None]:
# Box plot of the document lengths (`doc_length` comes from the histogram
# cell) to make outliers visible.
# The values are passed explicitly as `x=`: a bare positional argument is
# interpreted differently across seaborn versions (older releases warn and
# treat it as `x`, newer ones treat it as `data`), which can change the
# orientation of the plot.
fig, ax = plt.subplots(figsize=(15, 5))
sns.boxplot(x=doc_length, ax=ax)
ax.set(title="Document length box plot", xlabel="Tokens per document");

In [None]:
# Display the precomputed document statistics table; the first CSV column
# is used as the row index.
doc_stats_path = '../data/eda_files/doc_stats.csv'
pd.read_csv(doc_stats_path, index_col=0)

In [None]:
# Set the default figure size (width, height in inches) for the plots that
# follow. Note that `sns.set` also applies seaborn's default theme.
figure_size = (11.7, 8.27)
sns.set(rc={'figure.figsize': figure_size})

- **Length distribution of sentences**

In [None]:
# Load the sentence table; later cells read its 'Sentence' column.
sent_path = '../data/eda_files/all_sent.csv'
sent_df = pd.read_csv(sent_path)

In [None]:
# Histogram with a KDE overlay of sentence lengths, where length is the
# number of whitespace-separated tokens in each 'Sentence' entry.
# `sns.distplot` is deprecated; `sns.histplot(kde=True, stat="density")` is
# its documented replacement and keeps the density-scaled look.
sent_length = [len(sent.split()) for sent in sent_df['Sentence']]  # also used by the box-plot cell
ax = sns.histplot(sent_length, kde=True, stat="density")
ax.set(
    title="Sentence length distribution",
    xlabel="Tokens per sentence",
    ylabel="Density",
);

In [None]:
# Box plot of the sentence lengths (`sent_length` comes from the histogram
# cell) to make outliers visible.
# The values are passed explicitly as `x=`: a bare positional argument is
# interpreted differently across seaborn versions, which can change the
# orientation of the plot.
ax = sns.boxplot(x=sent_length)
ax.set(title="Sentence length box plot", xlabel="Tokens per sentence");

In [None]:
# Display the precomputed sentence statistics table; the first CSV column
# is used as the row index.
sent_stats_path = '../data/eda_files/sent_stats.csv'
pd.read_csv(sent_stats_path, index_col=0)

## What is the distribution of all tokens? How many "rare" tokens (e.g., < 5 times)?

In [None]:
# Load the per-token count table; the next cell reads its 'Count' column.
token_counts_path = '../data/eda_files/all_token_counts.csv'
all_token = pd.read_csv(token_counts_path)

In [None]:
# Distribution of token counts, restricted to tokens whose 'Count' is at
# least MIN_TOKEN_COUNT (i.e. the "rare" tokens below the threshold are
# left out of the plot).
# The filter is a vectorised boolean mask instead of a Python-level loop,
# and the deprecated `sns.distplot` is replaced by `sns.histplot`.
MIN_TOKEN_COUNT = 5
frequent_counts = all_token.loc[all_token['Count'] >= MIN_TOKEN_COUNT, 'Count']
ax = sns.histplot(frequent_counts, kde=True, stat="density")
ax.set(
    title=f"Token count distribution (count >= {MIN_TOKEN_COUNT})",
    xlabel="Occurrences per token",
    ylabel="Density",
);

In [None]:
# Display the precomputed token statistics table; the first CSV column is
# used as the row index.
token_stats_path = '../data/eda_files/token_stats.csv'
pd.read_csv(token_stats_path, index_col=0)

## Run AutoPhrase, and then plot the quality score distribution of single-word and multi-word phrases separately. Compare and discuss their differences.

In [None]:
# Load the multi-word phrase scores (first CSV column as index) and plot
# the distribution of the 'Scores' column. Per the section heading these
# are presumably AutoPhrase quality scores.
# `sns.distplot` is deprecated; `sns.histplot(kde=True, stat="density")` is
# its documented replacement.
multi_word_scores = pd.read_csv('../data/eda_files/multi_score.csv', index_col=0)
ax = sns.histplot(multi_word_scores['Scores'], kde=True, stat="density")
ax.set(
    title="Multi-word phrase score distribution",
    xlabel="Score",
    ylabel="Density",
);

In [None]:
# Load the single-word phrase scores (first CSV column as index) and plot
# the distribution of the 'Scores' column. Per the section heading these
# are presumably AutoPhrase quality scores.
# `sns.distplot` is deprecated; `sns.histplot(kde=True, stat="density")` is
# its documented replacement.
single_word_scores = pd.read_csv('../data/eda_files/single_score.csv', index_col=0)
ax = sns.histplot(single_word_scores['Scores'], kde=True, stat="density")
ax.set(
    title="Single-word phrase score distribution",
    xlabel="Score",
    ylabel="Density",
);

In [None]:
# Overlay the multi-word and single-word score histograms on one axes so
# the two distributions can be compared directly (raw counts, no KDE).
# `sns.distplot(kde=False)` is deprecated and replaced by `sns.histplot`.
# alpha=0.4 keeps both histograms visible where they overlap. The legend is
# built once from the per-plot `label=` values rather than repeating the
# label list in a second call.
fig, ax = plt.subplots(figsize=(11.7, 8.27))
sns.histplot(multi_word_scores['Scores'], color="skyblue", alpha=0.4, label='Multi Words', ax=ax)
sns.histplot(single_word_scores['Scores'], color="olive", alpha=0.4, label='Single Word', ax=ax)
ax.set(
    title="Score distribution: multi-word vs. single-word phrases",
    xlabel="Score",
    ylabel="Number of phrases",
)
ax.legend();