In [1]:
# Импорт библиотек
import sqlite3 
from sqlite3 import Error 
import pandas as pd
import re
import numpy as np

**Шаг 1**  
Прочитать файлы и создать над ними таблицы, где структуры таблиц:  

1. articles в виде:  
 |-- id: integer (nullable = true)  
 |-- text: string (nullable = true) 
 
 
2. stopwords в виде:  
 |-- word: string (nullable = true)  

Перед выполнением задачи предварительно обработать данные:  
- при парсинге отбросить все символы, которые не являются латинскими буквами;  
- привести все слова к нижнему регистру;  
- удалить все стоп-слова из articles с помощью таблицы stopwords.  

In [2]:
# Helper that opens a connection to the database.
def create_connection(path):
    """Open a SQLite connection to ``path``.

    Prints a status line either way. Returns the connection object on
    success, or None when ``sqlite3.connect`` raises ``sqlite3.Error``.
    """
    try:
        opened = sqlite3.connect(path)
    except Error as e:
        print(f"The error '{e}' occurred")
        return None
    print("Connection to SQLite DB successful")
    return opened

In [3]:
# Open the connection to the database. SQLite implicitly creates the database file if it does not exist.
con = create_connection("HWd.db")

Connection to SQLite DB successful


In [4]:
# Helper that runs one statement against the database: open a cursor, execute,
# commit on success or report the error, and close the cursor in every case.
def execute_query(connection, query):
    """Execute a single SQL statement and commit it.

    Args:
        connection: open connection object providing ``cursor()`` and ``commit()``.
        query: SQL text passed to ``cursor.execute``.

    Returns:
        None. The outcome is printed; a ``sqlite3.Error`` raised by the
        statement or the commit is caught and printed, not re-raised.
    """
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        connection.commit()
        print("Query executed successfully")
    except Error as e:
        print(f"The error '{e}' occurred")
    finally:
        # Runs on the failure path as well, so a failing statement does not leak the cursor
        cursor.close()

In [5]:
# SQL statement that creates the articles table (id INTEGER, text TEXT) if it is not there yet.
create_article_table = """
CREATE TABLE IF NOT EXISTS articles (
  id INTEGER,
  text TEXT
);
"""

In [6]:
# Run the statement that creates the articles table.
execute_query(con, create_article_table)

Query executed successfully


In [7]:
# SQL statement that creates the stopwords table (word TEXT) if it is not there yet.
create_stopwords_table = """
CREATE TABLE IF NOT EXISTS stopwords (
  word TEXT
);
"""

In [8]:
# Run the statement that creates the stopwords table.
execute_query(con, create_stopwords_table)

Query executed successfully


In [9]:
# Read the semicolon-separated articles file (no header row) into a DataFrame
# with the columns "id" and "text".
df_articles = pd.read_csv(
    'articles.csv',
    sep=';',
    header=None,
    names=['id', 'text'],
)
df_articles

Unnamed: 0,id,text
0,1,Bradley Charles Cooper born January 5 1975 is ...
1,2,Cooper enrolled in the MFA program at the Acto...
2,3,Cooper found greater success with the romantic...
3,4,Labeled a sex symbol by the media Cooper was n...
4,5,Cooper was born on January 5 1975 in Abingto...
...,...,...
355,356,"As he had in 1940, Roosevelt won re-election w..."
356,357,This is the last election in which New Hampshi...
357,358,one of three elections since 1896 which the ev...
358,359,"The 432 electoral votes received by Roosevelt,..."


In [10]:
# Regex matching every character that is neither a Latin letter nor whitespace.
# Written as a raw string: in a plain literal '\s' is an invalid escape sequence,
# which recent Python versions warn about.
pattern = re.compile(r'[^a-zA-Z\s]')
# Delete the matched characters from the "text" column (whitespace is kept)
df_articles['text'] = df_articles['text'].apply(lambda text: pattern.sub('', text))

In [11]:
# Display df_articles to inspect the text column.
df_articles

Unnamed: 0,id,text
0,1,Bradley Charles Cooper born January is an Am...
1,2,Cooper enrolled in the MFA program at the Acto...
2,3,Cooper found greater success with the romantic...
3,4,Labeled a sex symbol by the media Cooper was n...
4,5,Cooper was born on January in Abington Tow...
...,...,...
355,356,As he had in Roosevelt won reelection with a ...
356,357,This is the last election in which New Hampshi...
357,358,one of three elections since which the eventu...
358,359,The electoral votes received by Roosevelt add...


In [12]:
# Collapse every run of whitespace in each text into a single space.
# Raw string: '\s' in a plain literal is an invalid escape sequence.
whitespace_run = re.compile(r'\s+')
df_articles['text'] = df_articles['text'].apply(lambda text: whitespace_run.sub(' ', text))

In [13]:
# Lower-case every article text.
df_articles['text'] = df_articles['text'].map(str.lower)

In [14]:
# Read the stop-word file (no header row) into a single-column DataFrame.
# keep_default_na=False stops pandas from parsing literal words such as "null"
# or "nan" as missing values; as NaN they could never match a token.
df_stop = pd.read_csv('stopwords.csv', header=None, names=['word'], keep_default_na=False)
df_stop

Unnamed: 0,word
0,x
1,y
2,your
3,yours
4,yourself
...,...
728,excepts
729,except
730,excepting
731,exes


In [15]:
# Stop words as a list, plus a set built from it so that each membership test
# is a hash lookup instead of a scan over the whole list.
stopwords = list(df_stop['word'])
stopword_set = set(stopwords)


# Function applied to every article text
def remove_stopwords(text):
    """Return ``text`` without its stop words.

    The text is split on whitespace; tokens present in ``stopword_set`` are
    dropped and the remaining tokens are joined back with single spaces.

    Args:
        text: string to filter.

    Returns:
        The filtered string (empty if every token was a stop word).
    """
    return ' '.join(word for word in text.split() if word not in stopword_set)


# Apply remove_stopwords to the "text" column
df_articles['text'] = df_articles['text'].apply(remove_stopwords)

In [16]:
# First five cleaned texts as a NumPy array.
df_articles['text'].head(5).to_numpy()

array(['bradley charles cooper born january american actor filmmaker recipient accolades including british academy film award grammy awards addition nominations academy awards golden globe awards tony award cooper forbes celebrity times times list influential people world films grossed billion worldwide placed times annual rankings worlds highestpaid actors',
       'cooper enrolled mfa program actors studio beginning career guest role television series sex city made film debut comedy wet hot american summer gained recognition will tippin spyaction television show alias role show demoted began career doubts gained recognition supporting part comedy film wedding crashers breakthrough hangover critically commercially successful comedy spawned sequels coopers portrayal struggling writer thriller limitless rookie police officer crime drama place pines drew praise critics',
       'cooper found greater success romantic comedy silver linings playbook black comedy american hustle war biopic a

In [17]:
# Write all cleaned articles into the articles table.
# if_exists='replace' rebuilds the table on every run, so re-running the notebook
# against an existing database file does not append a second copy of each row.
# The dtype mapping keeps the declared column types (id INTEGER, text TEXT).
df_articles.to_sql(
    'articles',
    con,
    if_exists='replace',
    index=False,
    dtype={'id': 'INTEGER', 'text': 'TEXT'},
)

In [18]:
# Write all stop words into the stopwords table.
# if_exists='replace' rebuilds the table on every run, so re-running the notebook
# against an existing database file does not append a second copy of each row.
# The dtype mapping keeps the declared column type (word TEXT).
df_stop.to_sql(
    'stopwords',
    con,
    if_exists='replace',
    index=False,
    dtype={'word': 'TEXT'},
)

**Шаг 2**  
Извлечь коллокации в тексте articles.csv. Это комбинации слов, которые часто встречаются вместе. Например, «smart boss» или «linings playbook». Чтобы найти коллокации, нужно использовать метрику NPMI (нормализованная точечная взаимная информация): $\mathrm{NPMI}(a, b) = \dfrac{\mathrm{PMI}(a, b)}{-\ln P(ab)}$, где $\mathrm{PMI}(a, b) = \ln \dfrac{P(ab)}{P(a)\,P(b)}$.  
Таким образом, для каждой комбинации слов *ab* посчитать NPMI и вывести на экран TOP-50 самых популярных коллокаций, отсортированных по убыванию значения NPMI.  
Комбинацию слов *ab* объединить пробелом.

In [19]:
# Build a frame with one row per word: "index" is the row label of the article
# the word came from, "words" is the word itself.
token_rows = [
    (article_label, token)
    for article_label, article_text in df_articles['text'].items()
    for token in article_text.split()
]
words_df = pd.DataFrame(token_rows, columns=['index', 'words'])

In [20]:
# Display the one-word-per-row frame.
words_df

Unnamed: 0,index,words
0,0,bradley
1,0,charles
2,0,cooper
3,0,born
4,0,january
...,...,...
17496,359,archives
17497,359,records
17498,359,administration
17499,359,retrieved


In [21]:
# SQL statement that creates the temporary table words_temp (one TEXT column "word").
# IF NOT EXISTS keeps the statement re-runnable on a connection where the table
# has already been created.
create_temp_table = """CREATE TEMPORARY TABLE IF NOT EXISTS words_temp (word TEXT)"""

In [22]:
# Run the statement that creates the temporary table words_temp.
execute_query(con, create_temp_table)

Query executed successfully


In [23]:
# Load every word into the temporary table in one batch.
cursor = con.cursor()
try:
    # Clear earlier rows so that re-running this cell does not duplicate the words
    cursor.execute('DELETE FROM words_temp')
    # Insertion order is the order of words_df rows
    cursor.executemany(
        'INSERT INTO words_temp VALUES (?)',
        ((word,) for word in words_df['words'].tolist()),
    )
    con.commit()
    print("Query executed successfully")
except Error as e:
    # Discard a partially applied batch before reporting the failure
    con.rollback()
    print(f"The error '{e}' occurred")
finally:
    cursor.close()

Query executed successfully


In [24]:
# Count how many times each distinct word occurs in the temporary table.
word_frequency_sql = 'SELECT word, COUNT(word) AS words_count FROM words_temp GROUP BY word'
words_count = pd.read_sql_query(word_frequency_sql, con)
words_count

Unnamed: 0,word,words_count
0,aaron,1
1,ab,1
2,ababu,1
3,abandon,1
4,abandoned,2
...,...,...
6036,zellweger,1
6037,ziryab,1
6038,ziyad,2
6039,zoe,1


In [25]:
# Query that pairs every word with the word stored right after it and counts
# how often each resulting bigram occurs.
# - The window is ordered by rowid (insertion order); without an ORDER BY the
#   row that LEAD() treats as "next" is not defined.
# - The last word has no successor, so its NULL pair is filtered out explicitly.
# NOTE(review): words_temp has only a "word" column, so a pair is also formed
# across the boundary between two consecutive articles — confirm this is intended.
query = """
SELECT bigram, COUNT(bigram) AS COUNT_COL
FROM (
    SELECT word || ' ' || COLLOCATION AS bigram
    FROM (
        SELECT word, LEAD(word) OVER (ORDER BY rowid) AS COLLOCATION
        FROM words_temp
    )
    WHERE COLLOCATION IS NOT NULL
)
GROUP BY bigram
;
"""

In [26]:
# Run the bigram-count query and keep the result as a DataFrame.
coll_count = pd.read_sql_query(sql=query, con=con)
coll_count

Unnamed: 0,bigram,COUNT_COL
0,aaron jordan,1
1,ab omnibus,1
2,ababu season,1
3,abandon wallace,1
4,abandoned civil,1
...,...,...
15660,ziryab major,1
15661,ziyad crossed,1
15662,ziyad earlier,1
15663,zoe saldana,1


In [27]:
# Split each bigram on the space into its first and second word.
bigram_parts = coll_count['bigram'].str.split(' ', expand=True)
coll_count['word1'] = bigram_parts[0]
coll_count['word2'] = bigram_parts[1]
coll_count

Unnamed: 0,bigram,COUNT_COL,word1,word2
0,aaron jordan,1,aaron,jordan
1,ab omnibus,1,ab,omnibus
2,ababu season,1,ababu,season
3,abandon wallace,1,abandon,wallace
4,abandoned civil,1,abandoned,civil
...,...,...,...,...
15660,ziryab major,1,ziryab,major
15661,ziyad crossed,1,ziyad,crossed
15662,ziyad earlier,1,ziyad,earlier
15663,zoe saldana,1,zoe,saldana


In [28]:
# Attach to every bigram the corpus count of its first word (words_count_1)
# and of its second word (words_count_2), then drop the duplicated join keys.
with_first_count = coll_count.merge(
    words_count, how='left', left_on='word1', right_on='word',
)
with_both_counts = with_first_count.merge(
    words_count, how='left', left_on='word2', right_on='word', suffixes=['_1', '_2'],
)
merged_df = with_both_counts.drop(columns=['word_1', 'word_2'])

In [29]:
# Display the bigram counts joined with the counts of both member words.
merged_df

Unnamed: 0,bigram,COUNT_COL,word1,word2,words_count_1,words_count_2
0,aaron jordan,1,aaron,jordan,1,60
1,ab omnibus,1,ab,omnibus,1,1
2,ababu season,1,ababu,season,1,40
3,abandon wallace,1,abandon,wallace,1,7
4,abandoned civil,1,abandoned,civil,2,2
...,...,...,...,...,...,...
15660,ziryab major,1,ziryab,major,1,14
15661,ziyad crossed,1,ziyad,crossed,2,1
15662,ziyad earlier,1,ziyad,earlier,2,6
15663,zoe saldana,1,zoe,saldana,1,1


In [30]:
# Total number of word tokens (rows of words_df).
n_words = words_df.shape[0]
n_words

17501

In [31]:
# Compute P(a), P(b), P(ab), PMI and NPMI for every bigram in one vectorized pass
# (no per-row DataFrame growth).
# Word probabilities use word count / n_words; the bigram probability uses
# bigram count / (n_words - 1).
# NOTE(review): because these two denominators differ, NPMI comes out marginally
# above 1 for words that only ever occur together — confirm this is acceptable.
p_a = merged_df['words_count_1'] / n_words
p_b = merged_df['words_count_2'] / n_words
p_ab = merged_df['COUNT_COL'] / (n_words - 1)

# PMI = ln( P(ab) / (P(a) * P(b)) )
pmi = np.log(p_ab / (p_a * p_b))

# NPMI = PMI / -ln( P(ab) )
npmi = -(pmi / np.log(p_ab))

df = pd.DataFrame({
    'bigram': merged_df['bigram'],
    'P(a)': p_a,
    'P(b)': p_b,
    'P(ab)': p_ab,
    'PMI': pmi,
    'NPMI': npmi,
}).reset_index(drop=True)
df

Unnamed: 0,bigram,P(a),P(b),P(ab),PMI,NPMI
0,aaron jordan,0.000057,0.003428,0.000057,5.675726,0.580937
1,ab omnibus,0.000057,0.000057,0.000057,9.770070,1.000012
2,ababu season,0.000057,0.002286,0.000057,6.081191,0.622438
3,abandon wallace,0.000057,0.000400,0.000057,7.824160,0.800839
4,abandoned civil,0.000114,0.000114,0.000057,8.383776,0.858118
...,...,...,...,...,...,...
15660,ziryab major,0.000057,0.000800,0.000057,7.131013,0.729892
15661,ziyad crossed,0.000114,0.000057,0.000057,9.076923,0.929065
15662,ziyad earlier,0.000114,0.000343,0.000057,7.285164,0.745670
15663,zoe saldana,0.000057,0.000057,0.000057,9.770070,1.000012


In [32]:
# Rank the bigrams by NPMI, highest first, and show the first 50.
ranked_by_npmi = df.sort_values(by='NPMI', ascending=False)
ranked_by_npmi.loc[:, ['bigram', 'NPMI']].head(50)

Unnamed: 0,bigram,NPMI
5541,fish fry,1.000014
8086,linings playbook,1.000014
9410,nightmare alley,1.000014
8243,los angeles,1.000014
6211,guardians galaxy,1.000014
7990,licorice pizza,1.000013
15317,willy wanker,1.000013
1314,barack obama,1.000013
6792,iberian peninsula,1.000013
6387,hazardous equipment,1.000013


In [33]:
# Close the database connection.
con.close()