In [1]:
# lens of tables
SMALL_SZ = 1_000_000
BIG_SZ = 100_000_000   # for ticket
BLOCK_SZ = 10_000_000  # for ticket

In [2]:
import numpy as np
import pandas as pd
import json
import csv
from mimesis import Generic

In [4]:
def fmt(s):
    return "{{{}}}".format(", ".join(map(str,s)))

In [3]:
g = Generic('en')

### customer

In [80]:
def customer_info():
    info = {'name': g.person.full_name(), 
            'contacts': {'email': g.person.email(), 
                         'number': g.person.telephone()}, 
            'age': g.person.age(), 
            'nationality': g.person.nationality()}
    return json.dumps(info)

In [81]:
%%time
c_customer_infos = [customer_info() for _ in range(SMALL_SZ)]

customer = pd.DataFrame({
    'customer_info': c_customer_infos
})

customer.to_csv('../data/customer.csv', header=False, 
                index=False, sep='\t', quoting=csv.QUOTE_NONE)

CPU times: user 32.7 s, sys: 431 ms, total: 33.1 s
Wall time: 33.2 s


### ticket

In [75]:
%%time
t_customer_ids = np.random.randint(low=1, high=SMALL_SZ+1, size=BLOCK_SZ)
t_fest_ids = np.random.randint(low=1, high=SMALL_SZ+1, size=BLOCK_SZ)
t_price_ids = np.random.randint(low=1, high=11, size=BLOCK_SZ)

ticket = pd.DataFrame({
    'customer_id': t_customer_ids,
    'fest_id': t_fest_ids,
    'price_id': t_price_ids
})

ticket.to_csv('../data/ticket10.csv', header=False, 
                index=False, sep='\t', quoting=csv.QUOTE_NONE)
# and then the same for ticket2.csv, ticket3.csv, ..., ticket10.csv

CPU times: user 7.65 s, sys: 184 ms, total: 7.83 s
Wall time: 7.84 s


### fest

In [89]:
def ratings():
    r = np.random.randint(low=50, high=1500, size=10)
    return fmt(r)


def prices():
    p = np.sort(np.random.randint(low=1, high=21, size=10)) * 10
    return fmt(p)


def n_tickets():
    t = np.random.randint(low=40, high=1400, size=10)
    return fmt(t)

In [90]:
%%time
f_ratings = [ratings() for _ in range(SMALL_SZ)]
f_prices = [prices() for _ in range(SMALL_SZ)]
f_n_tickets = [n_tickets() for _ in range(SMALL_SZ)]

fest = pd.DataFrame({
    'ratings': f_ratings,
    'prices': f_prices,
    'n_tickets': f_n_tickets
})

fest.to_csv('../data/fest.csv', header=False, 
                index=False, sep='\t', quoting=csv.QUOTE_NONE)

CPU times: user 1min 6s, sys: 720 ms, total: 1min 6s
Wall time: 1min 9s


### genre

In [7]:
def total_ratings():
    tr = np.random.randint(low=1_000, high=15_000_000, size=10)
    return fmt(tr)

In [15]:
g_genre_names = ['hard rock', 'industrial metal', 'heavy metal', 
          'gothic metal', 'gothic rock', 'glam metal', 
          'glam rock', 'industrial rock', 'arena rock', 
          'pop rock', 'NDH', 'EBM', 'alternative metal', 
          'progressive metal', 'metalcore', 'rap rock', 
          'rock', 'trash metal', 'industrial techno', 
          'shock rock', 'nu metal', 'art rock', 'space rock', 
          'synth rock', 'dance rock', 'electronic rock', 
          'electro-industrial', 'symphonic metal', 
          'indie rock', 'garage rock', 'post-punk', 
          'blues rock', 'rock and roll', 'funk rock', 
          'funk metal', 'experimental', 'alternative rock', 
          'neoclassic metal']
G_SZ = len(g_genre_names)

In [16]:
%%time
g_ratings = [total_ratings() for _ in range(G_SZ)]
g_total_revenues = np.random.randint(low=20_000, high=100_000_000, size=G_SZ)
g_total_tickets = np.random.randint(low=8_000, high=300_000, size=G_SZ)

genre = pd.DataFrame({
    'genre_name': g_genre_names,
    'ratings': g_ratings,
    'total_revenue': g_total_revenues,
    'total_tickets': g_total_tickets
})

genre.to_csv('../data/genre.csv', header=False, 
                index=False, sep='\t', quoting=csv.QUOTE_NONE)

CPU times: user 1.17 ms, sys: 2.97 ms, total: 4.14 ms
Wall time: 3.3 ms


### genre_fest

In [20]:
MED_SZ = 4_000_000

In [21]:
%%time
gf_genre_ids = np.random.randint(low=1, high=G_SZ+1, size=MED_SZ)
gf_fest_ids = np.random.randint(low=1, high=SMALL_SZ+1, size=MED_SZ)

genre_fest = pd.DataFrame({
    'genre_id': gf_genre_ids,
    'fest_id': gf_fest_ids
})

genre_fest.to_csv('../data/genre_fest.csv', header=False, 
                index=False, sep='\t', quoting=csv.QUOTE_NONE)

CPU times: user 2.03 s, sys: 39.8 ms, total: 2.07 s
Wall time: 2.07 s


### rewiew

In [88]:
%%time
r_customer_ids = np.random.randint(low=1, high=SMALL_SZ+1, size=SMALL_SZ)
r_fest_ids = np.random.randint(low=1, high=SMALL_SZ+1, size=SMALL_SZ)
r_ratings = np.random.randint(low=1, high=11, size=SMALL_SZ)
r_contents = [g.text.text(2) for _ in range(SMALL_SZ)]

rewiew = pd.DataFrame({
    'customer_id': r_customer_ids,
    'fest_id': r_fest_ids,
    'rating': r_ratings,
    'content': r_contents
})

rewiew.to_csv('../data/rewiew.csv', header=False, 
                index=False, sep='\t', quoting=csv.QUOTE_NONE)

CPU times: user 5.02 s, sys: 251 ms, total: 5.27 s
Wall time: 5.48 s


Какую статистику и зачем собираем?

Анализируем ... в зависимости от ...

1) Популярность жанров/групп -страны -возраста
2) Цены билетов
3) Кореляция цены/оценки/отзывов
4) В какой стране самые популярные/больше всего фестивалей
5) Средняя оценка/МО/std/...