In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
# Apply matplotlib's built-in 'ggplot' style sheet globally.
plt.style.use('ggplot')

# NOTE(review): `figure` is not referenced in the cells visible here.
from matplotlib.pyplot import figure
# IPython magic: render figures inside the notebook output area.
%matplotlib inline
# Default figure size as (width, height) in inches; the trailing (12,8) is
# presumably a smaller alternative kept for reference.
matplotlib.rcParams['figure.figsize'] = (20, 20)  # (12,8)

# None switches off pandas' chained-assignment check (SettingWithCopyWarning).
pd.options.mode.chained_assignment = None
# None lifts the column limit, so wide frames are printed with every column.
pd.set_option('display.max_columns', None)

In [2]:

# Pickled article dumps covering 2019.10.01 up to 2020.04.21, one file per month.
list_of_paths = [
    'data/oct_2019_1102_1001.pickle',
    'data/nov_2019_1202_1101.pickle',
    'data/dec_2020_0102_1201.pickle',
    'data/jan_2020_0202_0101.pickle',
    'data/feb_2020_0302_0201.pickle',
    'data/mar_2020_0401_0301.pickle',
    'data/apr_2020_0421_0330.pickle',
]

# File name format, using '2019_1102_1001' as an example:
#   2019 is the year, 1102 is month 11 / day 02, 1001 is month 10 / day 01.
# 1102 is the top (latest) date and 1001 is the bottom (earliest) date;
# articles were saved going from the top date down to the bottom date.
# So everything from 10.01 up to 11.02 was saved as the single October file.
# NOTE(review): the year appears to be that of the top date (see 'dec_2020_0102_1201').
# NOTE(review): consecutive files overlap (e.g. October ends at 11.02 while November
# starts at 11.01), so the concatenated data can contain duplicate articles.

In [3]:
def _read_pickled_frame(pickle_path):
    """Load one pickled batch of articles and wrap it in a DataFrame."""
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file, so this must only ever be pointed at trusted, locally produced dumps.
    with open(pickle_path, 'rb') as handle:
        return pd.DataFrame(pickle.load(handle))


# One frame per monthly dump, in the same order as list_of_paths.
list_of_df = [_read_pickled_frame(path) for path in list_of_paths]

# Stack the monthly frames and renumber the rows 0..n-1.
df = pd.concat(list_of_df, ignore_index=True)

# Plain-text repr of the combined frame (pandas truncates long frames itself).
print(df)

timestamp                                              title  \
0      1572641980  Устроивший взрывы в двух мечетях на севере Лив...   
1      1572641796  Лавров: США сами доказали, что доллар - ненаде...   
2      1572641774  Футбольный матч "Торпедо" - "СКА-Хабаровск" пе...   
3      1572641660  Лавров: Россия никогда не прекратит бороться з...   
4      1572641500  Лавров: Россия и Китай не будут заключать воен...   
...           ...                                                ...   
71395  1585516117  Медведев: покидать квартиру или дачу в нерабоч...   
71396  1585516064  Медведев не исключил принятия в России более ж...   
71397  1585515997  Автоконцерны приостанавливают производство маш...   
71398  1585515789  Неделя в России будет нерабочей из-за коронави...   
71399  1585515738  Самолет со сработавшим датчиком отказа двигате...   

                 category                     href    date_ymd  date_hms  \
0                Общество      /obschestvo/7073248  2019-11-01  23:

In [4]:
# Structural overview: dimensions first, then the dtype of every column.
for overview in (df.shape, df.dtypes):
    print(overview)

# Split the frame into its numeric and non-numeric parts.
df_numeric = df.select_dtypes(include=[np.number])
df_non_numeric = df.select_dtypes(exclude=[np.number])

# Column names of each part as plain numpy arrays.
numeric_cols = df_numeric.columns.to_numpy()
non_numeric_cols = df_non_numeric.columns.to_numpy()

for col_names in (numeric_cols, non_numeric_cols):
    print(col_names)

(71400, 9)
timestamp            int64
title               object
category            object
href                object
date_ymd            object
date_hms            object
date_full           object
is_breaking_news      bool
article_text        object
dtype: object
['timestamp']
['title' 'category' 'href' 'date_ymd' 'date_hms' 'date_full'
 'is_breaking_news' 'article_text']


In [5]:
# Rendering the null-value heatmap for a DataFrame this large is not recommended; see the 'output_images/' folder instead.

# cols = df.columns
# colors = ['#b00b69', '#9e9e9e']
# sns.heatmap(df[cols].isnull(), cmap=sns.color_palette(colors))

In [6]:
# Collect every row that has a missing value in at least one column
# (any column counts, not only a missing article text).
# `axis` is passed by keyword: the positional form `.any(1)` is deprecated
# and is rejected with a TypeError by pandas 2.0 and later.
df_of_only_null_articles = df.loc[df.isnull().any(axis=1), :]

print(df_of_only_null_articles.shape)


(352, 9)


In [7]:
# Discard every row whose article text is missing; the shape is printed
# before and after so the number of removed rows is visible.
print(df.shape)
df = df.dropna(subset=['article_text'])
print(df.shape)

(71400, 9)
(71048, 9)


In [8]:
# Keep only the first row seen for each timestamp value.
# NOTE(review): uniqueness is judged on `timestamp` alone, so two different
# articles published in the same second would be collapsed into one - confirm
# that this is intended.
df = df.drop_duplicates(subset='timestamp', keep='first')
print(df.shape)

(68873, 9)


In [9]:
# Distinct publication days, in order of first appearance.
column_values = df["date_ymd"].to_numpy()
unique_values = pd.unique(column_values)

# Only the count is printed (203 days in the recorded run); print
# unique_values itself to list the individual days.
print(unique_values.shape)

(203,)


In [10]:
# Number of distinct article timestamps for every calendar day
# (a Series indexed by date_ymd, despite the `df_` prefix in the name).
df_of_articles_per_unique_day = df.groupby('date_ymd')['timestamp'].nunique()

print(df_of_articles_per_unique_day)
print()

# Quietest day(s) first, then busiest day(s): print the extreme count
# followed by every day that reaches it.
for extreme_count in (
    df_of_articles_per_unique_day.min(),
    df_of_articles_per_unique_day.max(),
):
    print(extreme_count)
    print(
        df_of_articles_per_unique_day[
            df_of_articles_per_unique_day == extreme_count
        ]
    )

date_ymd
2019-10-01    437
2019-10-02    410
2019-10-03    463
2019-10-04    395
2019-10-05    159
             ... 
2020-04-16    405
2020-04-17    429
2020-04-18    170
2020-04-19    138
2020-04-20    330
Name: timestamp, Length: 203, dtype: int64

110
date_ymd
2019-11-24    110
Name: timestamp, dtype: int64
577
date_ymd
2020-03-12    577
Name: timestamp, dtype: int64


In [11]:
# Average number of articles per day: exact value first, then the same
# value with its fractional part cut off by int().
mean_articles_per_day = df_of_articles_per_unique_day.mean()
print(mean_articles_per_day)
print(int(mean_articles_per_day))


339.2758620689655
339


In [12]:
# Persist the cleaned frame as CSV. The row index is written as the first
# (unnamed) column because to_csv is called with its default settings.
cleared_csv_path = 'data/cleared_data.csv'
df.to_csv(cleared_csv_path)