In [14]:
import logging
import os

import sqlite3
import pandas as pd

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    handlers=[
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
logger.setLevel('INFO')

db_file = 'data/messages.db'

conn = sqlite3.connect(db_file, check_same_thread=True)

def run_sql(sql_str, db_con=conn, cols = None):
    with db_con as con:
        res = pd.DataFrame(con.execute(sql_str).fetchall(), columns=cols)
    return res
        
sql_str = """
    SELECT 
        name
    FROM 
        sqlite_master
    WHERE 
        name NOT LIKE 'sqlite_%';
"""
run_sql(sql_str, cols=['table_name'])


Unnamed: 0,table_name
0,tg_messages


In [15]:
TABLE_NAME = 'tg_messages'

table_df = pd.read_sql_query(f"SELECT * from {TABLE_NAME} LIMIT 10", conn)

table_df.head()

Unnamed: 0,id,msg,channel,msg_hash
0,2248,управляющая компания предлагает в аренду 1 одн...,rentinlimassol,314c320246f78c0db8ddb8489072f7ab
1,2246,управляющая компания предлагает в аренду целое...,rentinlimassol,6f3d312ef2c7f712ba0f761556492df0
2,2238,управляющая компания предлагает в аренду роско...,rentinlimassol,cb3cb9c0da0195b069392b849003d84f
3,2232,управляющая компания предлагает в аренду совре...,rentinlimassol,3f29b9e54210ea6c4ea3cafb27d04a9b
4,2224,управляющая компания предлагает в аренду роско...,rentinlimassol,2c697381b6caac91e7b684fa34b86025


In [16]:
sql_str = f"""
    SELECT
        COUNT(*) as num_messages,
        CAST(AVG(length(msg))  as integer) as avg_length,
        COUNT(DISTINCT channel) num_channels
    FROM {TABLE_NAME}
    LIMIT 10
"""

pd.read_sql_query(sql_str, conn)

Unnamed: 0,num_messages,avg_length,num_channels
0,27037,324,6


In [17]:
from jinja2 import Template

def get_neg_samples_df():
    irrelevant_msg_ids = [
        153375, 130177, 152303, 156005, 152225, 152209, 152159, 152129,
        152831, 152766, 152740, 152697, 129161, 129139, 152628, 152556
    ]

    sql_str = Template(
        """
        SELECT 
            id, msg
        FROM {{ table }}
        WHERE id IN (
            {%- for msg_id in msg_ids -%} {{msg_id}} {{"," if not loop.last }} {% endfor %}
        )
        """
    ).render(msg_ids=irrelevant_msg_ids, table=TABLE_NAME)

    neg_samples_df = pd.read_sql_query(sql_str, conn)
    num_neg_samples = neg_samples_df.shape[0]

    logger.info('num rows: %d', num_neg_samples)
    
    return neg_samples_df

neg_samples_df = get_neg_samples_df()
neg_samples_df.head()

2023-01-08 23:31:18,648 num rows: 16


Unnamed: 0,id,msg
0,129161,продажа (собственник) лимассол 580 000 евро ...
1,129139,аренда город ларнака . район декелия . дом три...
2,152740,"продажа продается новая квартира за €310,000 ..."
3,152129,**mandarin park: новое высотное здание в лимас...
4,152159,**как получить визитерскую визу на кипре? инст...


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer


def get_train_set(limit = -1):
    negatives_df = get_neg_samples_df()
    if limit < 0:
        num_negatives = negatives_df.shape[0]
        num_positives = int( num_negatives / 0.1)
        limit = num_positives + num_negatives
    else:
        limit = 8*10**3
    irrelevant_msg_ids = negatives_df['id'].values.tolist()
    sql_str = Template(
        """
        SELECT 
            msg,
            length(msg) as len_msg,
            CASE
                WHEN id IN (
                        {%- for msg_id in msg_ids -%} {{msg_id}} {{"," if not loop.last }} {% endfor %}
                    )
                THEN 1
                ELSE 0
            END target
        FROM {{ table }}
        ORDER BY target DESC
        LIMIT {{ limit }}
        """
    ).render(msg_ids=irrelevant_msg_ids, table=TABLE_NAME, limit=limit)

    corpus_df = pd.read_sql_query(sql_str, conn)
    
    return corpus_df

class Pandas2CSR:
    def __init__(self):
        self.vectorizer = None
        self.txt_col = None
        self.anchor_elements = None
    
    def df_to_matrix(self, input_series):
        res = input_series.values.reshape(-1).tolist()
        
        return res
    
    def fit(self, input_df, text_column='msg'):
        csr_matrix_dataset = self.df_to_matrix(input_df[text_column])
        self.txt_col = text_column
        
        logger.info('num rows: %d', len(csr_matrix_dataset))

        self.vectorizer = TfidfVectorizer()
        X = self.vectorizer.fit_transform(csr_matrix_dataset)
        logger.info('sparse matrix %s', X.shape)
        
        return X
    
    def transform(self, input_df):
        corpus = self.df_to_matrix(input_df[self.txt_col])
        X = self.vectorizer.transform(corpus)
        
        logger.info('result matrix %s', X.shape)
        
        return X
    
    def generate_features(self, neg_samples_df):
        # сохраняем якорные элементы
        if self.anchor_elements is None:
            self.anchor_elements = self.transform(neg_samples_df)
        anchor_elems

corpus_df = get_train_set()
pandas2csr = Pandas2CSR()
raw_matrix = pandas2csr.fit(corpus_df)

corpus_df.head()

2023-01-08 23:31:20,151 num rows: 16
2023-01-08 23:31:20,184 num rows: 176
2023-01-08 23:31:20,201 sparse matrix (176, 2390)


Unnamed: 0,msg,len_msg,target
0,продажа (собственник) лимассол 580 000 евро ...,2009,1
1,аренда город ларнака . район декелия . дом три...,140,1
2,"продажа продается новая квартира за €310,000 ...",588,1
3,**mandarin park: новое высотное здание в лимас...,197,1
4,**как получить визитерскую визу на кипре? инст...,99,1


In [19]:
from sklearn.metrics.pairwise import euclidean_distances

neg_samples_csr = pandas2csr.transform(neg_samples_df)

distances = euclidean_distances(raw_matrix, neg_samples_csr)
logger.info(distances.shape)

2023-01-08 23:31:20,943 result matrix (16, 2390)
2023-01-08 23:31:20,946 (176, 16)


In [20]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression().fit(distances, corpus_df['target'])



In [21]:
corpus_df = get_train_set(limit=8*10**3)
raw_matrix = pandas2csr.transform(corpus_df)
distances = euclidean_distances(raw_matrix, neg_samples_csr)
neg_example_proba = lr.predict_proba(distances)

2023-01-08 23:31:22,734 num rows: 16
2023-01-08 23:31:23,096 result matrix (8000, 2390)


In [22]:
corpus_df['dummy_label'] = neg_example_proba[:,1]

pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.expand_frame_repr', False)

scored_corpus_df = (
    corpus_df.query("len_msg > 0")
    [['msg', 'dummy_label']]
    .sort_values(by='dummy_label', ascending=False)
)

scored_corpus_df.head(270)

Unnamed: 0,msg,dummy_label
3,**mandarin park: новое высотное здание в лимассоле **в районе гермасойя возведут элитный небоскреб. [https://dom.com.cy/live/digest-63766](https://dom.com.cy/live/digest-63766/?utm_source=telegram),0.265801
6,**на кипре продлили налоговые льготы при реструктуризации необслуживаемых кредитов** до какого числа проблемные заемщики получили послабления? [https://dom.com.cy/live/digest-63770](https://dom.com.cy/live/digest-63770/?utm_source=telegram),0.262643
9,"**полуостров акамас: забытый рай на кипре ** особенности региона, плюсы и минусы покупки недвижимости. [https://dom.com.cy/live/lifehacks-63910](https://dom.com.cy/live/lifehacks-63910/?utm_source=telegram)",0.252452
3939,**район пейя в пафосе вскоре кардинально преобразится** в регионе реализуется масса важных инфраструктурных проектов. https://dom.com.cy/live/digest-57332/,0.218025
8,**подавляющее большинство зданий лимассола расположено в сейсмоопасной зоне** опасно ли это для жителей города? https://dom.com.cy/live/digest-63892/?utm_source=telegram,0.201085
13,"**в лимассоле завершилось строительство здания для absolute institute of technical education **стоимость проекта оценивается в 8,5 млн евро. https://dom.com.cy/live/digest-63948/?utm_source=telegram",0.192891
4732,"όταν παραπέμπετε οποιοδήποτε μέλος στην επένδυση sarwacapitals, δικαιούστε 5% σε κάθε κατάθεση που κάνετε από την downline σας. για το δεύτερο επίπεδο, θα λάβετε 3% προμήθεια συνεργατών και για το τρίτο επίπεδο θα λάβετε 1% προμήθεια θυγατρικών. μπορείτε να επανεπενδύσετε την προμήθεια σας ή να κάνετε ανάληψη ανά πάσα στιγμή.",0.189911
7648,0mgmgd1,0.189911
2970,фронтенд разработка,0.189911
4413,зубной? половой?,0.189911


In [23]:
scored_corpus_df.to_csv('data/scored_corpus.csv', index=False)
logger.info('%d lines saved', scored_corpus_df.shape[0])

2023-01-08 23:31:27,156 6542 lines saved


In [24]:
df = pd.read_csv('data/labeled_data_corpus.csv')

train_df = df[df['subset'] == 'train']
test_df = df[df['subset'] == 'test']
print(train_df.shape[0], train_df['label'].mean(), test_df.shape[0], test_df['label'].mean())

df.head()

5233 0.20045862793808522 1309 0.20091673032849502


Unnamed: 0,msg_id,msg,label,subset
0,0,"здравствуйте. ишу 2х спальную квартиру в лимассоле. желательно гермасойя. семья из 2х взрослых и 2х детей. без животных. на длительный срок, бюджет до 1000-1500 евро. предложения в лс.",0,train
1,1,#сниму комнату в лимассоле или недалеко от него. с начала августа. любые предложения в лс,0,train
2,2,мошенник риэлторским услугам.,0,train
3,3,"**sales** reg.1053 lic.489/e **stylish apartment with sea view kissonerga. paphos** •total area: 85 m2 + balcony •bedrooms: 2 •bathrooms: 1 **€ 120,000** we have a lot to offer ================ **продажа** reg.1053 lic.489/e **стильные апартаменты вид на море •••kissonerga. пафос. ** •общая площадь: 85м2 + балкон •спальни: 2 •ванные комнаты: 1 **€ 120 000 ****+35726935826**** директ telegram 24/7** у нас есть что вам предложить",0,train
4,4,"важно: [valerii korol](tg://user?id=193474890), если ты не бот и не спамер, пройди проверку, нажав на кнопку, где есть",0,train


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.01, 1.0, 0.01),
    'min_df': np.arange(1, 20, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    vectorizer = TfidfVectorizer(**param_values).fit(X_train)
    X_train_csr = vectorizer.transform(X_train)
    lr = LogisticRegression().fit(X_train_csr, y_train)
    # predict
    X_test_csr = vectorizer.transform(X_test)
    y_pred = lr.predict(X_test_csr)
    cur_score = f1_score(y_true, y_pred)
    if cur_score > best_score:
        best_score = cur_score
        best_params.update(param_values)
    if cnt % 250 == 0:
        logging.info(
            'iteration: %d of %d; %s; best_score= %.4f',
            cnt, num_iters, param_values, best_score
        )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score)

2023-01-08 10:46:46,876 iteration: 0 of 1881; {'max_df': 0.01, 'min_df': 1}; best_score= 0.6209
2023-01-08 10:49:04,406 iteration: 250 of 1881; {'max_df': 0.14, 'min_df': 4}; best_score= 0.8549
2023-01-08 10:51:26,067 iteration: 500 of 1881; {'max_df': 0.27, 'min_df': 7}; best_score= 0.8605
2023-01-08 10:53:48,101 iteration: 750 of 1881; {'max_df': 0.4, 'min_df': 10}; best_score= 0.8605
2023-01-08 10:56:05,337 iteration: 1000 of 1881; {'max_df': 0.53, 'min_df': 13}; best_score= 0.8605
2023-01-08 10:58:29,749 iteration: 1250 of 1881; {'max_df': 0.66, 'min_df': 16}; best_score= 0.8605
2023-01-08 11:01:04,276 iteration: 1500 of 1881; {'max_df': 0.79, 'min_df': 19}; best_score= 0.8605
2023-01-08 11:03:36,671 iteration: 1750 of 1881; {'max_df': 0.93, 'min_df': 3}; best_score= 0.8605


best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 14} 0.8605108055009824


In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.01, 0.2, 0.01),
    'min_df': np.arange(1, 15, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    vectorizer = TfidfVectorizer(**param_values).fit(X_train)
    X_train_csr = vectorizer.transform(X_train)
    lr = LogisticRegression().fit(X_train_csr, y_train)
    #gbr = GradientBoostingRegressor(random_state=0).fit(X_train_csr, y_train)
    # predict
    X_test_csr = vectorizer.transform(X_test)
    y_pred = lr.predict(X_test_csr)
    cur_score = f1_score(y_true, y_pred)
    if cur_score > best_score:
        best_score = cur_score
        best_params.update(param_values)
    if cnt % 250 == 0:
        logging.info(
            'iteration: %d of %d; %s; best_score= %.4f',
            cnt, num_iters, param_values, best_score
        )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score)

2023-01-08 11:57:51,289 iteration: 0 of 266; {'max_df': 0.01, 'min_df': 1}; best_score= 0.6209
2023-01-08 12:06:01,913 iteration: 250 of 266; {'max_df': 0.18000000000000002, 'min_df': 13}; best_score= 0.8605


best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 14} 0.8605108055009824


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.1, 0.2, 0.01),
    'min_df': np.arange(1, 15, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    vectorizer = TfidfVectorizer(**param_values).fit(X_train)
    X_train_csr = vectorizer.transform(X_train)
    lr = LogisticRegression().fit(X_train_csr, y_train)
    #gbr = GradientBoostingRegressor(random_state=0).fit(X_train_csr, y_train)
    # predict
    X_test_csr = vectorizer.transform(X_test)
    y_pred = lr.predict(X_test_csr)
    cur_score = f1_score(y_true, y_pred)
    if cur_score > best_score:
        best_score = cur_score
        best_params.update(param_values)
    if cnt % 250 == 0:
        logging.info(
            'iteration: %d of %d; %s; best_score= %.4f',
            cnt, num_iters, param_values, best_score
        )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score)

2023-01-08 13:13:28,172 iteration: 0 of 140; {'max_df': 0.1, 'min_df': 1}; best_score= 0.8247


best params: %s, best_score %.4f {'max_df': 0.15999999999999998, 'min_df': 14} 0.8605108055009824


In [34]:
df.reset_index(drop=True).rename(columns={'index': 'msg_id'}).head()


Unnamed: 0,msg_id,msg,label,subset
0,0,"здравствуйте. ишу 2х спальную квартиру в лимассоле. желательно гермасойя. семья из 2х взрослых и 2х детей. без животных. на длительный срок, бюджет до 1000-1500 евро. предложения в лс.",0,train
1,1,#сниму комнату в лимассоле или недалеко от него. с начала августа. любые предложения в лс,0,train
2,2,мошенник риэлторским услугам.,0,train
3,3,"**sales** reg.1053 lic.489/e **stylish apartment with sea view kissonerga. paphos** •total area: 85 m2 + balcony •bedrooms: 2 •bathrooms: 1 **€ 120,000** we have a lot to offer ================ **продажа** reg.1053 lic.489/e **стильные апартаменты вид на море •••kissonerga. пафос. ** •общая площадь: 85м2 + балкон •спальни: 2 •ванные комнаты: 1 **€ 120 000 ****+35726935826**** директ telegram 24/7** у нас есть что вам предложить",0,train
4,4,"важно: [valerii korol](tg://user?id=193474890), если ты не бот и не спамер, пройди проверку, нажав на кнопку, где есть",0,train


2023-01-08 15:05:28,109 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638
2023-01-08 15:06:22,518 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638
2023-01-08 15:07:18,158 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638
2023-01-08 15:08:14,006 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638
2023-01-08 15:09:09,305 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638


best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 4} 0.8638132295719844 0.1


In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.16, 0.17, 0.01),
    'min_df': np.arange(4, 5, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    for n_es in np.arange(450, 551, 25):
        vectorizer = TfidfVectorizer(**param_values).fit(X_train)
        X_train_csr = vectorizer.transform(X_train)
        #lr = LogisticRegression().fit(X_train_csr, y_train)
        gbr = GradientBoostingRegressor(random_state=1, n_estimators=n_es, learning_rate=best_l_r).fit(X_train_csr, y_train)
        # predict
        X_test_csr = vectorizer.transform(X_test)
        y_pred = gbr.predict(X_test_csr)
        y_pred = [0 if i < 0.5 else 1 for i in y_pred]
        cur_score = f1_score(y_true, y_pred)
        if cur_score > best_score:
            best_score = cur_score
            best_params.update(param_values)
            best_n_es = n_es
        if cnt % 250 == 0:
            logging.info(
                'iteration: %d of %d; %s; best_score= %.4f',
                cnt, num_iters, param_values, best_score
            )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score, best_n_es)

2023-01-08 15:14:44,102 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8599
2023-01-08 15:15:37,405 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8621
2023-01-08 15:16:33,400 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638
2023-01-08 15:17:32,534 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638
2023-01-08 15:18:34,396 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638


best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 4} 0.8638132295719844 500


In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.16, 0.17, 0.01),
    'min_df': np.arange(4, 5, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    for l_r in np.arange(0.05, 0.151, 0.025):
        vectorizer = TfidfVectorizer(**param_values).fit(X_train)
        X_train_csr = vectorizer.transform(X_train)
        #lr = LogisticRegression().fit(X_train_csr, y_train)
        gbr = GradientBoostingRegressor(random_state=1, n_estimators=best_n_es, learning_rate=l_r).fit(X_train_csr, y_train)
        # predict
        X_test_csr = vectorizer.transform(X_test)
        y_pred = gbr.predict(X_test_csr)
        y_pred = [0 if i < 0.5 else 1 for i in y_pred]
        cur_score = f1_score(y_true, y_pred)
        if cur_score > best_score:
            best_score = cur_score
            best_params.update(param_values)
            best_l_r = l_r
        if cnt % 250 == 0:
            logging.info(
                'iteration: %d of %d; %s; best_score= %.4f',
                cnt, num_iters, param_values, best_score
            )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score, best_l_r)

2023-01-08 15:41:32,400 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8527
2023-01-08 15:42:17,791 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8588
2023-01-08 15:43:02,766 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638
2023-01-08 15:43:47,482 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638
2023-01-08 15:44:32,101 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638


best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 4} 0.8638132295719844 0.10000000000000002


In [72]:
best_n_es = 500
best_l_r = 0.1

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.16, 0.17, 0.01),
    'min_df': np.arange(4, 5, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    for d in np.arange(4, 10, 1):
        vectorizer = TfidfVectorizer(**param_values).fit(X_train)
        X_train_csr = vectorizer.transform(X_train)
        #lr = LogisticRegression().fit(X_train_csr, y_train)
        gbr = GradientBoostingRegressor(random_state=1, n_estimators=best_n_es, learning_rate=best_l_r, max_depth=d).fit(X_train_csr, y_train)
        # predict
        X_test_csr = vectorizer.transform(X_test)
        y_pred = gbr.predict(X_test_csr)
        y_pred = [0 if i < 0.5 else 1 for i in y_pred]
        cur_score = f1_score(y_true, y_pred)
        if cur_score > best_score:
            best_score = cur_score
            best_params.update(param_values)
            best_d = d
        if cnt % 250 == 0:
            logging.info(
                'iteration: %d of %d; %s; best_score= %.4f',
                cnt, num_iters, param_values, best_score
            )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score, best_d)

2023-01-08 16:28:48,493 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638
2023-01-08 16:30:18,291 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8654
2023-01-08 16:32:06,421 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8654
2023-01-08 16:34:10,017 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8654
2023-01-08 16:36:32,718 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8654
2023-01-08 16:39:14,381 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8654


best params: %s, best_score %.4f {'max_df': 0.17, 'min_df': 4} 0.8675623800383877 8


In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.16, 0.17, 0.01),
    'min_df': np.arange(4, 5, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    for rs in np.arange(3, 10, 1):
        vectorizer = TfidfVectorizer(**param_values).fit(X_train)
        X_train_csr = vectorizer.transform(X_train)
        #lr = LogisticRegression().fit(X_train_csr, y_train)
        gbr = GradientBoostingRegressor(random_state=rs, n_estimators=best_n_es, max_depth=best_d).fit(X_train_csr, y_train)
        # predict
        X_test_csr = vectorizer.transform(X_test)
        y_pred = gbr.predict(X_test_csr)
        y_pred = [0 if i < 0.5 else 1 for i in y_pred]
        cur_score = f1_score(y_true, y_pred)
        if cur_score > best_score:
            best_score = cur_score
            best_params.update(param_values)
            best_rs = rs
        if cnt % 250 == 0:
            logging.info(
                'iteration: %d of %d; %s; best_score= %.4f',
                cnt, num_iters, param_values, best_score
            )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score, best_rs)

2023-01-08 17:41:15,839 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8627
2023-01-08 17:41:59,091 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8637
2023-01-08 17:42:42,143 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681
2023-01-08 17:43:25,148 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681
2023-01-08 17:44:08,252 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681
2023-01-08 17:44:50,581 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681
2023-01-08 17:45:32,869 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681


best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 4} 0.8680688336520077 5


In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.15, 0.18, 0.01),
    'min_df': np.arange(4, 5, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    for d in np.arange(8, 10, 1):
        vectorizer = TfidfVectorizer(**param_values).fit(X_train)
        X_train_csr = vectorizer.transform(X_train)
        #lr = LogisticRegression().fit(X_train_csr, y_train)
        gbr = GradientBoostingRegressor(random_state=best_rs, n_estimators=best_n_es, max_depth=d).fit(X_train_csr, y_train)
        # predict
        X_test_csr = vectorizer.transform(X_test)
        y_pred = gbr.predict(X_test_csr)
        y_pred = [0 if i < 0.5 else 1 for i in y_pred]
        cur_score = f1_score(y_true, y_pred)
        if cur_score > best_score:
            best_score = cur_score
            best_params.update(param_values)
            best_d = d
        if cnt % 250 == 0:
            logging.info(
                'iteration: %d of %d; %s; best_score= %.4f',
                cnt, num_iters, param_values, best_score
            )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score, best_d)

2023-01-08 18:49:37,428 iteration: 0 of 3; {'max_df': 0.15, 'min_df': 4}; best_score= 0.8423
2023-01-08 18:52:12,685 iteration: 0 of 3; {'max_df': 0.15, 'min_df': 4}; best_score= 0.8423


best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 4} 0.8680688336520077 8


In [26]:
best_rs = 5
best_n_es = 500
best_d = 8

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.16, 0.17, 0.01),
    'min_df': np.arange(4, 5, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    for rs in np.arange(3, 10, 1):
        vectorizer = TfidfVectorizer(**param_values).fit(X_train)
        X_train_csr = vectorizer.transform(X_train)
        #lr = LogisticRegression().fit(X_train_csr, y_train)
        gbr = GradientBoostingRegressor(random_state=rs, n_estimators=best_n_es, max_depth=best_d).fit(X_train_csr, y_train)
        # predict
        X_test_csr = vectorizer.transform(X_test)
        y_pred = gbr.predict(X_test_csr)
        y_pred = [0 if i < 0.5 else 1 for i in y_pred]
        cur_score = f1_score(y_true, y_pred)
        if cur_score > best_score:
            best_score = cur_score
            best_params.update(param_values)
            best_rs = rs
        if cnt % 250 == 0:
            logging.info(
                'iteration: %d of %d; %s; best_score= %.4f',
                cnt, num_iters, param_values, best_score
            )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score, best_rs)

2023-01-08 23:35:47,604 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8627
2023-01-08 23:36:30,057 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8637
2023-01-08 23:37:12,723 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681
2023-01-08 23:37:55,237 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681
2023-01-08 23:38:37,996 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681
2023-01-08 23:39:20,386 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681
2023-01-08 23:40:03,293 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681


best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 4} 0.8680688336520077 5


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.16, 0.17, 0.01),
    'min_df': np.arange(4, 5, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    vectorizer = TfidfVectorizer(**param_values).fit(X_train)
    X_train_csr = vectorizer.transform(X_train)
    #lr = LogisticRegression().fit(X_train_csr, y_train)
    gbr = GradientBoostingRegressor(alpha=0.9, random_state=5, n_estimators=best_n_es, max_depth=best_d).fit(X_train_csr, y_train)
    # predict
    X_test_csr = vectorizer.transform(X_test)
    y_pred = gbr.predict(X_test_csr)
    y_pred = [0 if i < 0.5 else 1 for i in y_pred]
    cur_score = f1_score(y_true, y_pred)
    if cur_score > best_score:
        best_score = cur_score
        best_params.update(param_values)
        best_a = a
    if cnt % 250 == 0:
        logging.info(
            'iteration: %d of %d; %s; best_score= %.4f',
            cnt, num_iters, param_values, best_score
        )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score, best_a, best_rs, best_n_es, best_d)

2023-01-09 00:35:31,196 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681


best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 4} 0.8680688336520077 0.9 5 500 8


In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']


vectorizer = TfidfVectorizer(**param_values).fit(X_train)
X_train_csr = vectorizer.transform(X_train)
gbr = GradientBoostingRegressor(random_state=1, n_estimators=500, max_depth=8).fit(X_train_csr, y_train)
# predict
X_test_csr = vectorizer.transform(X_test)
y_pred = gbr.predict(X_test_csr)
y_pred = [0 if i < 0.5 else 1 for i in y_pred]
cur_score = f1_score(y_true, y_pred)

print('best params: %s, best_score %.4f', param_values, cur_score)

{'max_df': 0.16, 'min_df': 4}
best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 4} 0.8675623800383877


In [46]:
import pickle

model = GradientBoostingRegressor(random_state=1, n_estimators=best_n_es, max_depth=best_d).fit(X_train_csr, y_train)

pkl_filename = "model.pkl" 
with open(pkl_filename, 'wb') as file: 
    pickle.dump(model, file)

In [43]:
vectorizer = TfidfVectorizer(**param_values).fit(X_train)

pkl_filename = "vectorizer.pkl" 
with open(pkl_filename, 'wb') as file: 
    pickle.dump(vectorizer, file)

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

X_train_csr = vectorizer.transform(X_train)
gbr = model
# predict
X_test_csr = vectorizer.transform(X_test)
y_pred = gbr.predict(X_test_csr)
y_pred = [0 if i < 0.5 else 1 for i in y_pred]
cur_score = f1_score(y_true, y_pred)

print('best_score %.4f', cur_score)


best_score %.4f 0.8675623800383877


In [14]:
df.reset_index(drop=True).rename(columns={'index': 'msg_id'}).head()


Unnamed: 0,msg_id,msg,label,subset
0,0,"здравствуйте. ишу 2х спальную квартиру в лимассоле. желательно гермасойя. семья из 2х взрослых и 2х детей. без животных. на длительный срок, бюджет до 1000-1500 евро. предложения в лс.",0,train
1,1,#сниму комнату в лимассоле или недалеко от него. с начала августа. любые предложения в лс,0,train
2,2,мошенник риэлторским услугам.,0,train
3,3,"**sales** reg.1053 lic.489/e **stylish apartment with sea view kissonerga. paphos** •total area: 85 m2 + balcony •bedrooms: 2 •bathrooms: 1 **€ 120,000** we have a lot to offer ================ **продажа** reg.1053 lic.489/e **стильные апартаменты вид на море •••kissonerga. пафос. ** •общая площадь: 85м2 + балкон •спальни: 2 •ванные комнаты: 1 **€ 120 000 ****+35726935826**** директ telegram 24/7** у нас есть что вам предложить",0,train
4,4,"важно: [valerii korol](tg://user?id=193474890), если ты не бот и не спамер, пройди проверку, нажав на кнопку, где есть",0,train


In [26]:
df.reset_index(drop=True).rename(columns={'index': 'msg_id'}).to_csv('data/labeled_data_corpus2.csv', index=False)


In [27]:
df2 = pd.read_csv('data/labeled_data_corpus2.csv')
df2.head()

Unnamed: 0,msg,label
0,"здравствуйте. ишу 2х спальную квартиру в лимассоле. желательно гермасойя. семья из 2х взрослых и 2х детей. без животных. на длительный срок, бюджет до 1000-1500 евро. предложения в лс.",0
1,#сниму комнату в лимассоле или недалеко от него. с начала августа. любые предложения в лс,0
2,мошенник риэлторским услугам.,0
3,"**sales** reg.1053 lic.489/e **stylish apartment with sea view kissonerga. paphos** •total area: 85 m2 + balcony •bedrooms: 2 •bathrooms: 1 **€ 120,000** we have a lot to offer ================ **продажа** reg.1053 lic.489/e **стильные апартаменты вид на море •••kissonerga. пафос. ** •общая площадь: 85м2 + балкон •спальни: 2 •ванные комнаты: 1 **€ 120 000 ****+35726935826**** директ telegram 24/7** у нас есть что вам предложить",0
4,"важно: [valerii korol](tg://user?id=193474890), если ты не бот и не спамер, пройди проверку, нажав на кнопку, где есть",0
5,"аренда no: 367/e ️ларнака️между пила и декелия ️ в пешей доступности от моря. тихий район. вилла 3 спальни, 3 санузла просторная веранда с летней мебелью открытого плана кухня (духовка, вытяжка и кухонная плита, посудомоечная машинка стиральная машина и сушилка) бассейн отопление крытая жилая площадь составляет около 300 кв.м. дом полностью обставлен современной мебелью. цена: 2,100 евро. один депозит 2 предоплаты. подробнее по телефону +35797726055",0
6,привет ищу виллу посуточно с бюджетом 2000€ в сутки от 1 до 3 дней с 11 по 15 августа примерно вилла нужна для видеосъемки любой город или посёлок современный дизайн крайне важен пишите мне в личку или по номеру 95727146 юля,0
7,"важно: [liss](tg://user?id=202814885), если ты не бот и не спамер, пройди проверку, нажав на кнопку, где есть",0
8,total messages: 126772,0
9,"аренда ️ларнака ️в центре города️ saint lazaro church ️ всё в пешей доступности до центральной набережной финекудес 5 минут. no: 367/е квартира 3 спальня 2 санузел кухня открытого плана, все электро приборы стиральная машинка посудомоечная машинка балконы кондиционеры парковка стоимость: 1.300 евро торг уместен подробнее по телефону +35797726055",0
