# Dataset analysis

Looks through the data available in the MongoDB instance and shows several stats used to narrow down the input data for the models

In [None]:
import pandas as pd
import plotly.express as px
from pymongo import MongoClient


In [None]:
# Open a client for the MongoDB server on localhost:27017 and grab the
# `documents` collection of the `frtp` database.
client = MongoClient(host='127.0.0.1', port=27017)
db = client['frtp']
collection = db['documents']


In [None]:
# Fetch every document in the collection and load them into one DataFrame.
cursor = collection.find()
data = pd.DataFrame(list(cursor))


### A preview of the data stored in the DB

In [None]:
# Discard the first two rows, then show the first 20 of what remains.
# NOTE(review): the reason for skipping the first two rows is not stated here
# -- confirm. Re-running this cell drops two further rows each time.
data = data.iloc[2:]
data.head(20)


Keep only the stats for each entry

Reformat year for display purposes

In [None]:
# Keep only the per-document statistics columns. Take an explicit copy so the
# column assignment below writes to an independent frame instead of a frame
# derived from `data` (which triggers pandas' SettingWithCopyWarning).
stats_data = data[['ticker', 'year', 'start_index', 'end_index', 'size']].copy()
# Prefix the year with '20' for display; presumably `year` holds two-digit
# strings such as '15' -- TODO confirm against the stored documents.
stats_data['year'] = '20' + stats_data['year']


In [None]:
# Number of non-null values per column (first five columns shown).
stats_data.count().head(5)


In [None]:
# Count the non-null entries of every column per year and turn `year` back
# into a regular column so it can be used as a plot axis.
stats_data = (
    stats_data
    .groupby(['year'])
    .count()
    .reset_index()
)


In [None]:
# Bar chart of the per-year `ticker` counts.
# NOTE(review): 'No' in the y-axis label presumably abbreviates 'Number of'
# -- consider spelling it out.
axis_labels = {
    'year': 'Submission year',
    'ticker': 'No companies reporting',
}
fig = px.bar(
    data_frame=stats_data,
    x='year',
    y='ticker',
    title="Distribution of documents based on the year",
    labels=axis_labels,
)
fig.show()


In [None]:
# Histogram (at most 50 bins) of the `size` column over all loaded documents.
size_labels = {'size': 'Length of extracted Risk Factors section'}
fig = px.histogram(
    data_frame=data,
    x="size",
    nbins=50,
    labels=size_labels,
    title="Distribution of size of the Risk Factors Section",
)
fig.show()


In [None]:
# Keep only the year and section size. Take an explicit copy so the column
# assignment below writes to an independent frame instead of a frame derived
# from `data` (which triggers pandas' SettingWithCopyWarning).
stats_data = data[['year', 'size']].copy()
# Prefix the year with '20' for display; presumably `year` holds two-digit
# strings such as '15' -- TODO confirm against the stored documents.
stats_data['year'] = '20' + stats_data['year']


In [None]:
# Average the remaining columns per year and turn `year` back into a regular
# column so it can be used as a plot axis.
stats_data = (
    stats_data
    .groupby('year')
    .mean()
    .reset_index()
)


In [None]:
# Line chart of the per-year mean `size`.
fig = px.line(
    data_frame=stats_data,
    x="year",
    y="size",
    title='Evolution of Risk Factors Section size during the timeframe',
)
fig.show()
