In [14]:
import logging
import os

import sqlite3
import pandas as pd

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    handlers=[
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
logger.setLevel('INFO')

db_file = 'data/messages.db'

conn = sqlite3.connect(db_file, check_same_thread=True)

def run_sql(sql_str, db_con=conn, cols = None):
    with db_con as con:
        res = pd.DataFrame(con.execute(sql_str).fetchall(), columns=cols)
    return res
        
sql_str = """
    SELECT 
        name
    FROM 
        sqlite_master
    WHERE 
        name NOT LIKE 'sqlite_%';
"""
run_sql(sql_str, cols=['table_name'])


Unnamed: 0,table_name
0,tg_messages


In [15]:
TABLE_NAME = 'tg_messages'

table_df = pd.read_sql_query(f"SELECT * from {TABLE_NAME} LIMIT 10", conn)

table_df.head()

Unnamed: 0,id,msg,channel,msg_hash
0,2248,—É–ø—Ä–∞–≤–ª—è—é—â–∞—è –∫–æ–º–ø–∞–Ω–∏—è –ø—Ä–µ–¥–ª–∞–≥–∞–µ—Ç –≤ –∞—Ä–µ–Ω–¥—É 1 –æ–¥–Ω...,rentinlimassol,314c320246f78c0db8ddb8489072f7ab
1,2246,—É–ø—Ä–∞–≤–ª—è—é—â–∞—è –∫–æ–º–ø–∞–Ω–∏—è –ø—Ä–µ–¥–ª–∞–≥–∞–µ—Ç –≤ –∞—Ä–µ–Ω–¥—É —Ü–µ–ª–æ–µ...,rentinlimassol,6f3d312ef2c7f712ba0f761556492df0
2,2238,—É–ø—Ä–∞–≤–ª—è—é—â–∞—è –∫–æ–º–ø–∞–Ω–∏—è –ø—Ä–µ–¥–ª–∞–≥–∞–µ—Ç –≤ –∞—Ä–µ–Ω–¥—É —Ä–æ—Å–∫–æ...,rentinlimassol,cb3cb9c0da0195b069392b849003d84f
3,2232,—É–ø—Ä–∞–≤–ª—è—é—â–∞—è –∫–æ–º–ø–∞–Ω–∏—è –ø—Ä–µ–¥–ª–∞–≥–∞–µ—Ç –≤ –∞—Ä–µ–Ω–¥—É —Å–æ–≤—Ä–µ...,rentinlimassol,3f29b9e54210ea6c4ea3cafb27d04a9b
4,2224,—É–ø—Ä–∞–≤–ª—è—é—â–∞—è –∫–æ–º–ø–∞–Ω–∏—è –ø—Ä–µ–¥–ª–∞–≥–∞–µ—Ç –≤ –∞—Ä–µ–Ω–¥—É —Ä–æ—Å–∫–æ...,rentinlimassol,2c697381b6caac91e7b684fa34b86025


In [16]:
sql_str = f"""
    SELECT
        COUNT(*) as num_messages,
        CAST(AVG(length(msg))  as integer) as avg_length,
        COUNT(DISTINCT channel) num_channels
    FROM {TABLE_NAME}
    LIMIT 10
"""

pd.read_sql_query(sql_str, conn)

Unnamed: 0,num_messages,avg_length,num_channels
0,27037,324,6


In [17]:
from jinja2 import Template

def get_neg_samples_df():
    irrelevant_msg_ids = [
        153375, 130177, 152303, 156005, 152225, 152209, 152159, 152129,
        152831, 152766, 152740, 152697, 129161, 129139, 152628, 152556
    ]

    sql_str = Template(
        """
        SELECT 
            id, msg
        FROM {{ table }}
        WHERE id IN (
            {%- for msg_id in msg_ids -%} {{msg_id}} {{"," if not loop.last }} {% endfor %}
        )
        """
    ).render(msg_ids=irrelevant_msg_ids, table=TABLE_NAME)

    neg_samples_df = pd.read_sql_query(sql_str, conn)
    num_neg_samples = neg_samples_df.shape[0]

    logger.info('num rows: %d', num_neg_samples)
    
    return neg_samples_df

neg_samples_df = get_neg_samples_df()
neg_samples_df.head()

2023-01-08 23:31:18,648 num rows: 16


Unnamed: 0,id,msg
0,129161,–ø—Ä–æ–¥–∞–∂–∞ (—Å–æ–±—Å—Ç–≤–µ–Ω–Ω–∏–∫) –ª–∏–º–∞—Å—Å–æ–ª 580 000 –µ–≤—Ä–æ ...
1,129139,–∞—Ä–µ–Ω–¥–∞ –≥–æ—Ä–æ–¥ –ª–∞—Ä–Ω–∞–∫–∞ . —Ä–∞–π–æ–Ω –¥–µ–∫–µ–ª–∏—è . –¥–æ–º —Ç—Ä–∏...
2,152740,"–ø—Ä–æ–¥–∞–∂–∞ –ø—Ä–æ–¥–∞–µ—Ç—Å—è –Ω–æ–≤–∞—è –∫–≤–∞—Ä—Ç–∏—Ä–∞ –∑–∞ ‚Ç¨310,000 ..."
3,152129,**mandarin park: –Ω–æ–≤–æ–µ –≤—ã—Å–æ—Ç–Ω–æ–µ –∑–¥–∞–Ω–∏–µ –≤ –ª–∏–º–∞—Å...
4,152159,**–∫–∞–∫ –ø–æ–ª—É—á–∏—Ç—å –≤–∏–∑–∏—Ç–µ—Ä—Å–∫—É—é –≤–∏–∑—É –Ω–∞ –∫–∏–ø—Ä–µ? –∏–Ω—Å—Ç...


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer


def get_train_set(limit = -1):
    negatives_df = get_neg_samples_df()
    if limit < 0:
        num_negatives = negatives_df.shape[0]
        num_positives = int( num_negatives / 0.1)
        limit = num_positives + num_negatives
    else:
        limit = 8*10**3
    irrelevant_msg_ids = negatives_df['id'].values.tolist()
    sql_str = Template(
        """
        SELECT 
            msg,
            length(msg) as len_msg,
            CASE
                WHEN id IN (
                        {%- for msg_id in msg_ids -%} {{msg_id}} {{"," if not loop.last }} {% endfor %}
                    )
                THEN 1
                ELSE 0
            END target
        FROM {{ table }}
        ORDER BY target DESC
        LIMIT {{ limit }}
        """
    ).render(msg_ids=irrelevant_msg_ids, table=TABLE_NAME, limit=limit)

    corpus_df = pd.read_sql_query(sql_str, conn)
    
    return corpus_df

class Pandas2CSR:
    def __init__(self):
        self.vectorizer = None
        self.txt_col = None
        self.anchor_elements = None
    
    def df_to_matrix(self, input_series):
        res = input_series.values.reshape(-1).tolist()
        
        return res
    
    def fit(self, input_df, text_column='msg'):
        csr_matrix_dataset = self.df_to_matrix(input_df[text_column])
        self.txt_col = text_column
        
        logger.info('num rows: %d', len(csr_matrix_dataset))

        self.vectorizer = TfidfVectorizer()
        X = self.vectorizer.fit_transform(csr_matrix_dataset)
        logger.info('sparse matrix %s', X.shape)
        
        return X
    
    def transform(self, input_df):
        corpus = self.df_to_matrix(input_df[self.txt_col])
        X = self.vectorizer.transform(corpus)
        
        logger.info('result matrix %s', X.shape)
        
        return X
    
    def generate_features(self, neg_samples_df):
        # —Å–æ—Ö—Ä–∞–Ω—è–µ–º —è–∫–æ—Ä–Ω—ã–µ —ç–ª–µ–º–µ–Ω—Ç—ã
        if self.anchor_elements is None:
            self.anchor_elements = self.transform(neg_samples_df)
        anchor_elems

corpus_df = get_train_set()
pandas2csr = Pandas2CSR()
raw_matrix = pandas2csr.fit(corpus_df)

corpus_df.head()

2023-01-08 23:31:20,151 num rows: 16
2023-01-08 23:31:20,184 num rows: 176
2023-01-08 23:31:20,201 sparse matrix (176, 2390)


Unnamed: 0,msg,len_msg,target
0,–ø—Ä–æ–¥–∞–∂–∞ (—Å–æ–±—Å—Ç–≤–µ–Ω–Ω–∏–∫) –ª–∏–º–∞—Å—Å–æ–ª 580 000 –µ–≤—Ä–æ ...,2009,1
1,–∞—Ä–µ–Ω–¥–∞ –≥–æ—Ä–æ–¥ –ª–∞—Ä–Ω–∞–∫–∞ . —Ä–∞–π–æ–Ω –¥–µ–∫–µ–ª–∏—è . –¥–æ–º —Ç—Ä–∏...,140,1
2,"–ø—Ä–æ–¥–∞–∂–∞ –ø—Ä–æ–¥–∞–µ—Ç—Å—è –Ω–æ–≤–∞—è –∫–≤–∞—Ä—Ç–∏—Ä–∞ –∑–∞ ‚Ç¨310,000 ...",588,1
3,**mandarin park: –Ω–æ–≤–æ–µ –≤—ã—Å–æ—Ç–Ω–æ–µ –∑–¥–∞–Ω–∏–µ –≤ –ª–∏–º–∞—Å...,197,1
4,**–∫–∞–∫ –ø–æ–ª—É—á–∏—Ç—å –≤–∏–∑–∏—Ç–µ—Ä—Å–∫—É—é –≤–∏–∑—É –Ω–∞ –∫–∏–ø—Ä–µ? –∏–Ω—Å—Ç...,99,1


In [19]:
from sklearn.metrics.pairwise import euclidean_distances

neg_samples_csr = pandas2csr.transform(neg_samples_df)

distances = euclidean_distances(raw_matrix, neg_samples_csr)
logger.info(distances.shape)

2023-01-08 23:31:20,943 result matrix (16, 2390)
2023-01-08 23:31:20,946 (176, 16)


In [20]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression().fit(distances, corpus_df['target'])



In [21]:
corpus_df = get_train_set(limit=8*10**3)
raw_matrix = pandas2csr.transform(corpus_df)
distances = euclidean_distances(raw_matrix, neg_samples_csr)
neg_example_proba = lr.predict_proba(distances)

2023-01-08 23:31:22,734 num rows: 16
2023-01-08 23:31:23,096 result matrix (8000, 2390)


In [22]:
corpus_df['dummy_label'] = neg_example_proba[:,1]

pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.expand_frame_repr', False)

scored_corpus_df = (
    corpus_df.query("len_msg > 0")
    [['msg', 'dummy_label']]
    .sort_values(by='dummy_label', ascending=False)
)

scored_corpus_df.head(270)

Unnamed: 0,msg,dummy_label
3,**mandarin park: –Ω–æ–≤–æ–µ –≤—ã—Å–æ—Ç–Ω–æ–µ –∑–¥–∞–Ω–∏–µ –≤ –ª–∏–º–∞—Å—Å–æ–ª–µ **–≤ —Ä–∞–π–æ–Ω–µ –≥–µ—Ä–º–∞—Å–æ–π—è –≤–æ–∑–≤–µ–¥—É—Ç —ç–ª–∏—Ç–Ω—ã–π –Ω–µ–±–æ—Å–∫—Ä–µ–±. [https://dom.com.cy/live/digest-63766](https://dom.com.cy/live/digest-63766/?utm_source=telegram),0.265801
6,**–Ω–∞ –∫–∏–ø—Ä–µ –ø—Ä–æ–¥–ª–∏–ª–∏ –Ω–∞–ª–æ–≥–æ–≤—ã–µ –ª—å–≥–æ—Ç—ã –ø—Ä–∏ —Ä–µ—Å—Ç—Ä—É–∫—Ç—É—Ä–∏–∑–∞—Ü–∏–∏ –Ω–µ–æ–±—Å–ª—É–∂–∏–≤–∞–µ–º—ã—Ö –∫—Ä–µ–¥–∏—Ç–æ–≤** –¥–æ –∫–∞–∫–æ–≥–æ —á–∏—Å–ª–∞ –ø—Ä–æ–±–ª–µ–º–Ω—ã–µ –∑–∞–µ–º—â–∏–∫–∏ –ø–æ–ª—É—á–∏–ª–∏ –ø–æ—Å–ª–∞–±–ª–µ–Ω–∏—è? [https://dom.com.cy/live/digest-63770](https://dom.com.cy/live/digest-63770/?utm_source=telegram),0.262643
9,"**–ø–æ–ª—É–æ—Å—Ç—Ä–æ–≤ –∞–∫–∞–º–∞—Å: –∑–∞–±—ã—Ç—ã–π —Ä–∞–π –Ω–∞ –∫–∏–ø—Ä–µ ** –æ—Å–æ–±–µ–Ω–Ω–æ—Å—Ç–∏ —Ä–µ–≥–∏–æ–Ω–∞, –ø–ª—é—Å—ã –∏ –º–∏–Ω—É—Å—ã –ø–æ–∫—É–ø–∫–∏ –Ω–µ–¥–≤–∏–∂–∏–º–æ—Å—Ç–∏. [https://dom.com.cy/live/lifehacks-63910](https://dom.com.cy/live/lifehacks-63910/?utm_source=telegram)",0.252452
3939,**—Ä–∞–π–æ–Ω –ø–µ–π—è –≤ –ø–∞—Ñ–æ—Å–µ –≤—Å–∫–æ—Ä–µ –∫–∞—Ä–¥–∏–Ω–∞–ª—å–Ω–æ –ø—Ä–µ–æ–±—Ä–∞–∑–∏—Ç—Å—è** –≤ —Ä–µ–≥–∏–æ–Ω–µ —Ä–µ–∞–ª–∏–∑—É–µ—Ç—Å—è –º–∞—Å—Å–∞ –≤–∞–∂–Ω—ã—Ö –∏–Ω—Ñ—Ä–∞—Å—Ç—Ä—É–∫—Ç—É—Ä–Ω—ã—Ö –ø—Ä–æ–µ–∫—Ç–æ–≤. https://dom.com.cy/live/digest-57332/,0.218025
8,**–ø–æ–¥–∞–≤–ª—è—é—â–µ–µ –±–æ–ª—å—à–∏–Ω—Å—Ç–≤–æ –∑–¥–∞–Ω–∏–π –ª–∏–º–∞—Å—Å–æ–ª–∞ —Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω–æ –≤ —Å–µ–π—Å–º–æ–æ–ø–∞—Å–Ω–æ–π –∑–æ–Ω–µ** –æ–ø–∞—Å–Ω–æ –ª–∏ —ç—Ç–æ –¥–ª—è –∂–∏—Ç–µ–ª–µ–π –≥–æ—Ä–æ–¥–∞? https://dom.com.cy/live/digest-63892/?utm_source=telegram,0.201085
13,"**–≤ –ª–∏–º–∞—Å—Å–æ–ª–µ –∑–∞–≤–µ—Ä—à–∏–ª–æ—Å—å —Å—Ç—Ä–æ–∏—Ç–µ–ª—å—Å—Ç–≤–æ –∑–¥–∞–Ω–∏—è –¥–ª—è absolute institute of technical education **—Å—Ç–æ–∏–º–æ—Å—Ç—å –ø—Ä–æ–µ–∫—Ç–∞ –æ—Ü–µ–Ω–∏–≤–∞–µ—Ç—Å—è –≤ 8,5 –º–ª–Ω –µ–≤—Ä–æ. https://dom.com.cy/live/digest-63948/?utm_source=telegram",0.192891
4732,"œåœÑŒ±ŒΩ œÄŒ±œÅŒ±œÄŒ≠ŒºœÄŒµœÑŒµ ŒøœÄŒøŒπŒøŒ¥ŒÆœÄŒøœÑŒµ ŒºŒ≠ŒªŒøœÇ œÉœÑŒ∑ŒΩ ŒµœÄŒ≠ŒΩŒ¥œÖœÉŒ∑ sarwacapitals, Œ¥ŒπŒ∫Œ±ŒπŒøœçœÉœÑŒµ 5% œÉŒµ Œ∫Œ¨Œ∏Œµ Œ∫Œ±œÑŒ¨Œ∏ŒµœÉŒ∑ œÄŒøœÖ Œ∫Œ¨ŒΩŒµœÑŒµ Œ±œÄœå œÑŒ∑ŒΩ downline œÉŒ±œÇ. Œ≥ŒπŒ± œÑŒø Œ¥ŒµœçœÑŒµœÅŒø ŒµœÄŒØœÄŒµŒ¥Œø, Œ∏Œ± ŒªŒ¨Œ≤ŒµœÑŒµ 3% œÄœÅŒøŒºŒÆŒ∏ŒµŒπŒ± œÉœÖŒΩŒµœÅŒ≥Œ±œÑœéŒΩ Œ∫Œ±Œπ Œ≥ŒπŒ± œÑŒø œÑœÅŒØœÑŒø ŒµœÄŒØœÄŒµŒ¥Œø Œ∏Œ± ŒªŒ¨Œ≤ŒµœÑŒµ 1% œÄœÅŒøŒºŒÆŒ∏ŒµŒπŒ± Œ∏œÖŒ≥Œ±œÑœÅŒπŒ∫œéŒΩ. ŒºœÄŒøœÅŒµŒØœÑŒµ ŒΩŒ± ŒµœÄŒ±ŒΩŒµœÄŒµŒΩŒ¥œçœÉŒµœÑŒµ œÑŒ∑ŒΩ œÄœÅŒøŒºŒÆŒ∏ŒµŒπŒ± œÉŒ±œÇ ŒÆ ŒΩŒ± Œ∫Œ¨ŒΩŒµœÑŒµ Œ±ŒΩŒ¨ŒªŒ∑œàŒ∑ Œ±ŒΩŒ¨ œÄŒ¨œÉŒ± œÉœÑŒπŒ≥ŒºŒÆ.",0.189911
7648,0mgmgd1,0.189911
2970,—Ñ—Ä–æ–Ω—Ç–µ–Ω–¥ —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞,0.189911
4413,–∑—É–±–Ω–æ–π? –ø–æ–ª–æ–≤–æ–π?,0.189911


In [23]:
scored_corpus_df.to_csv('data/scored_corpus.csv', index=False)
logger.info('%d lines saved', scored_corpus_df.shape[0])

2023-01-08 23:31:27,156 6542 lines saved


In [24]:
df = pd.read_csv('data/labeled_data_corpus.csv')

train_df = df[df['subset'] == 'train']
test_df = df[df['subset'] == 'test']
print(train_df.shape[0], train_df['label'].mean(), test_df.shape[0], test_df['label'].mean())

df.head()

5233 0.20045862793808522 1309 0.20091673032849502


Unnamed: 0,msg_id,msg,label,subset
0,0,"–∑–¥—Ä–∞–≤—Å—Ç–≤—É–π—Ç–µ. –∏—à—É 2—Ö —Å–ø–∞–ª—å–Ω—É—é –∫–≤–∞—Ä—Ç–∏—Ä—É –≤ –ª–∏–º–∞—Å—Å–æ–ª–µ. –∂–µ–ª–∞—Ç–µ–ª—å–Ω–æ –≥–µ—Ä–º–∞—Å–æ–π—è. —Å–µ–º—å—è –∏–∑ 2—Ö –≤–∑—Ä–æ—Å–ª—ã—Ö –∏ 2—Ö –¥–µ—Ç–µ–π. –±–µ–∑ –∂–∏–≤–æ—Ç–Ω—ã—Ö. –Ω–∞ –¥–ª–∏—Ç–µ–ª—å–Ω—ã–π —Å—Ä–æ–∫, –±—é–¥–∂–µ—Ç –¥–æ 1000-1500 –µ–≤—Ä–æ. –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è –≤ –ª—Å.",0,train
1,1,#—Å–Ω–∏–º—É –∫–æ–º–Ω–∞—Ç—É –≤ –ª–∏–º–∞—Å—Å–æ–ª–µ –∏–ª–∏ –Ω–µ–¥–∞–ª–µ–∫–æ –æ—Ç –Ω–µ–≥–æ. —Å –Ω–∞—á–∞–ª–∞ –∞–≤–≥—É—Å—Ç–∞. –ª—é–±—ã–µ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è –≤ –ª—Å,0,train
2,2,–º–æ—à–µ–Ω–Ω–∏–∫ —Ä–∏—ç–ª—Ç–æ—Ä—Å–∫–∏–º —É—Å–ª—É–≥–∞–º.,0,train
3,3,"**sales** reg.1053 lic.489/e **stylish apartment with sea view kissonerga. paphos** ‚Ä¢total area: 85 m2 + balcony ‚Ä¢bedrooms: 2 ‚Ä¢bathrooms: 1 **‚Ç¨ 120,000** we have a lot to offer ================ **–ø—Ä–æ–¥–∞–∂–∞** reg.1053 lic.489/e **—Å—Ç–∏–ª—å–Ω—ã–µ –∞–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç—ã –≤–∏–¥ –Ω–∞ –º–æ—Ä–µ ‚Ä¢‚Ä¢‚Ä¢kissonerga. –ø–∞—Ñ–æ—Å. ** ‚Ä¢–æ–±—â–∞—è –ø–ª–æ—â–∞–¥—å: 85–º2 + –±–∞–ª–∫–æ–Ω ‚Ä¢—Å–ø–∞–ª—å–Ω–∏: 2 ‚Ä¢–≤–∞–Ω–Ω—ã–µ –∫–æ–º–Ω–∞—Ç—ã: 1 **‚Ç¨ 120 000 ****+35726935826**** –¥–∏—Ä–µ–∫—Ç telegram 24/7** —É –Ω–∞—Å –µ—Å—Ç—å —á—Ç–æ –≤–∞–º –ø—Ä–µ–¥–ª–æ–∂–∏—Ç—å",0,train
4,4,"–≤–∞–∂–Ω–æ: [valerii korol](tg://user?id=193474890), –µ—Å–ª–∏ —Ç—ã –Ω–µ –±–æ—Ç –∏ –Ω–µ —Å–ø–∞–º–µ—Ä, –ø—Ä–æ–π–¥–∏ –ø—Ä–æ–≤–µ—Ä–∫—É, –Ω–∞–∂–∞–≤ –Ω–∞ –∫–Ω–æ–ø–∫—É, –≥–¥–µ –µ—Å—Ç—å",0,train


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.01, 1.0, 0.01),
    'min_df': np.arange(1, 20, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    vectorizer = TfidfVectorizer(**param_values).fit(X_train)
    X_train_csr = vectorizer.transform(X_train)
    lr = LogisticRegression().fit(X_train_csr, y_train)
    # predict
    X_test_csr = vectorizer.transform(X_test)
    y_pred = lr.predict(X_test_csr)
    cur_score = f1_score(y_true, y_pred)
    if cur_score > best_score:
        best_score = cur_score
        best_params.update(param_values)
    if cnt % 250 == 0:
        logging.info(
            'iteration: %d of %d; %s; best_score= %.4f',
            cnt, num_iters, param_values, best_score
        )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score)

2023-01-08 10:46:46,876 iteration: 0 of 1881; {'max_df': 0.01, 'min_df': 1}; best_score= 0.6209
2023-01-08 10:49:04,406 iteration: 250 of 1881; {'max_df': 0.14, 'min_df': 4}; best_score= 0.8549
2023-01-08 10:51:26,067 iteration: 500 of 1881; {'max_df': 0.27, 'min_df': 7}; best_score= 0.8605
2023-01-08 10:53:48,101 iteration: 750 of 1881; {'max_df': 0.4, 'min_df': 10}; best_score= 0.8605
2023-01-08 10:56:05,337 iteration: 1000 of 1881; {'max_df': 0.53, 'min_df': 13}; best_score= 0.8605
2023-01-08 10:58:29,749 iteration: 1250 of 1881; {'max_df': 0.66, 'min_df': 16}; best_score= 0.8605
2023-01-08 11:01:04,276 iteration: 1500 of 1881; {'max_df': 0.79, 'min_df': 19}; best_score= 0.8605
2023-01-08 11:03:36,671 iteration: 1750 of 1881; {'max_df': 0.93, 'min_df': 3}; best_score= 0.8605


best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 14} 0.8605108055009824


In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.01, 0.2, 0.01),
    'min_df': np.arange(1, 15, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    vectorizer = TfidfVectorizer(**param_values).fit(X_train)
    X_train_csr = vectorizer.transform(X_train)
    lr = LogisticRegression().fit(X_train_csr, y_train)
    #gbr = GradientBoostingRegressor(random_state=0).fit(X_train_csr, y_train)
    # predict
    X_test_csr = vectorizer.transform(X_test)
    y_pred = lr.predict(X_test_csr)
    cur_score = f1_score(y_true, y_pred)
    if cur_score > best_score:
        best_score = cur_score
        best_params.update(param_values)
    if cnt % 250 == 0:
        logging.info(
            'iteration: %d of %d; %s; best_score= %.4f',
            cnt, num_iters, param_values, best_score
        )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score)

2023-01-08 11:57:51,289 iteration: 0 of 266; {'max_df': 0.01, 'min_df': 1}; best_score= 0.6209
2023-01-08 12:06:01,913 iteration: 250 of 266; {'max_df': 0.18000000000000002, 'min_df': 13}; best_score= 0.8605


best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 14} 0.8605108055009824


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.1, 0.2, 0.01),
    'min_df': np.arange(1, 15, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    vectorizer = TfidfVectorizer(**param_values).fit(X_train)
    X_train_csr = vectorizer.transform(X_train)
    lr = LogisticRegression().fit(X_train_csr, y_train)
    #gbr = GradientBoostingRegressor(random_state=0).fit(X_train_csr, y_train)
    # predict
    X_test_csr = vectorizer.transform(X_test)
    y_pred = lr.predict(X_test_csr)
    cur_score = f1_score(y_true, y_pred)
    if cur_score > best_score:
        best_score = cur_score
        best_params.update(param_values)
    if cnt % 250 == 0:
        logging.info(
            'iteration: %d of %d; %s; best_score= %.4f',
            cnt, num_iters, param_values, best_score
        )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score)

2023-01-08 13:13:28,172 iteration: 0 of 140; {'max_df': 0.1, 'min_df': 1}; best_score= 0.8247


best params: %s, best_score %.4f {'max_df': 0.15999999999999998, 'min_df': 14} 0.8605108055009824


In [34]:
df.reset_index(drop=True).rename(columns={'index': 'msg_id'}).head()


Unnamed: 0,msg_id,msg,label,subset
0,0,"–∑–¥—Ä–∞–≤—Å—Ç–≤—É–π—Ç–µ. –∏—à—É 2—Ö —Å–ø–∞–ª—å–Ω—É—é –∫–≤–∞—Ä—Ç–∏—Ä—É –≤ –ª–∏–º–∞—Å—Å–æ–ª–µ. –∂–µ–ª–∞—Ç–µ–ª—å–Ω–æ –≥–µ—Ä–º–∞—Å–æ–π—è. —Å–µ–º—å—è –∏–∑ 2—Ö –≤–∑—Ä–æ—Å–ª—ã—Ö –∏ 2—Ö –¥–µ—Ç–µ–π. –±–µ–∑ –∂–∏–≤–æ—Ç–Ω—ã—Ö. –Ω–∞ –¥–ª–∏—Ç–µ–ª—å–Ω—ã–π —Å—Ä–æ–∫, –±—é–¥–∂–µ—Ç –¥–æ 1000-1500 –µ–≤—Ä–æ. –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è –≤ –ª—Å.",0,train
1,1,#—Å–Ω–∏–º—É –∫–æ–º–Ω–∞—Ç—É –≤ –ª–∏–º–∞—Å—Å–æ–ª–µ –∏–ª–∏ –Ω–µ–¥–∞–ª–µ–∫–æ –æ—Ç –Ω–µ–≥–æ. —Å –Ω–∞—á–∞–ª–∞ –∞–≤–≥—É—Å—Ç–∞. –ª—é–±—ã–µ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è –≤ –ª—Å,0,train
2,2,–º–æ—à–µ–Ω–Ω–∏–∫ —Ä–∏—ç–ª—Ç–æ—Ä—Å–∫–∏–º —É—Å–ª—É–≥–∞–º.,0,train
3,3,"**sales** reg.1053 lic.489/e **stylish apartment with sea view kissonerga. paphos** ‚Ä¢total area: 85 m2 + balcony ‚Ä¢bedrooms: 2 ‚Ä¢bathrooms: 1 **‚Ç¨ 120,000** we have a lot to offer ================ **–ø—Ä–æ–¥–∞–∂–∞** reg.1053 lic.489/e **—Å—Ç–∏–ª—å–Ω—ã–µ –∞–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç—ã –≤–∏–¥ –Ω–∞ –º–æ—Ä–µ ‚Ä¢‚Ä¢‚Ä¢kissonerga. –ø–∞—Ñ–æ—Å. ** ‚Ä¢–æ–±—â–∞—è –ø–ª–æ—â–∞–¥—å: 85–º2 + –±–∞–ª–∫–æ–Ω ‚Ä¢—Å–ø–∞–ª—å–Ω–∏: 2 ‚Ä¢–≤–∞–Ω–Ω—ã–µ –∫–æ–º–Ω–∞—Ç—ã: 1 **‚Ç¨ 120 000 ****+35726935826**** –¥–∏—Ä–µ–∫—Ç telegram 24/7** —É –Ω–∞—Å –µ—Å—Ç—å —á—Ç–æ –≤–∞–º –ø—Ä–µ–¥–ª–æ–∂–∏—Ç—å",0,train
4,4,"–≤–∞–∂–Ω–æ: [valerii korol](tg://user?id=193474890), –µ—Å–ª–∏ —Ç—ã –Ω–µ –±–æ—Ç –∏ –Ω–µ —Å–ø–∞–º–µ—Ä, –ø—Ä–æ–π–¥–∏ –ø—Ä–æ–≤–µ—Ä–∫—É, –Ω–∞–∂–∞–≤ –Ω–∞ –∫–Ω–æ–ø–∫—É, –≥–¥–µ –µ—Å—Ç—å",0,train


2023-01-08 15:05:28,109 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638
2023-01-08 15:06:22,518 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638
2023-01-08 15:07:18,158 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638
2023-01-08 15:08:14,006 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638
2023-01-08 15:09:09,305 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638


best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 4} 0.8638132295719844 0.1


In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.16, 0.17, 0.01),
    'min_df': np.arange(4, 5, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    for n_es in np.arange(450, 551, 25):
        vectorizer = TfidfVectorizer(**param_values).fit(X_train)
        X_train_csr = vectorizer.transform(X_train)
        #lr = LogisticRegression().fit(X_train_csr, y_train)
        gbr = GradientBoostingRegressor(random_state=1, n_estimators=n_es, learning_rate=best_l_r).fit(X_train_csr, y_train)
        # predict
        X_test_csr = vectorizer.transform(X_test)
        y_pred = gbr.predict(X_test_csr)
        y_pred = [0 if i < 0.5 else 1 for i in y_pred]
        cur_score = f1_score(y_true, y_pred)
        if cur_score > best_score:
            best_score = cur_score
            best_params.update(param_values)
            best_n_es = n_es
        if cnt % 250 == 0:
            logging.info(
                'iteration: %d of %d; %s; best_score= %.4f',
                cnt, num_iters, param_values, best_score
            )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score, best_n_es)

2023-01-08 15:14:44,102 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8599
2023-01-08 15:15:37,405 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8621
2023-01-08 15:16:33,400 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638
2023-01-08 15:17:32,534 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638
2023-01-08 15:18:34,396 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638


best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 4} 0.8638132295719844 500


In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.16, 0.17, 0.01),
    'min_df': np.arange(4, 5, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    for l_r in np.arange(0.05, 0.151, 0.025):
        vectorizer = TfidfVectorizer(**param_values).fit(X_train)
        X_train_csr = vectorizer.transform(X_train)
        #lr = LogisticRegression().fit(X_train_csr, y_train)
        gbr = GradientBoostingRegressor(random_state=1, n_estimators=best_n_es, learning_rate=l_r).fit(X_train_csr, y_train)
        # predict
        X_test_csr = vectorizer.transform(X_test)
        y_pred = gbr.predict(X_test_csr)
        y_pred = [0 if i < 0.5 else 1 for i in y_pred]
        cur_score = f1_score(y_true, y_pred)
        if cur_score > best_score:
            best_score = cur_score
            best_params.update(param_values)
            best_l_r = l_r
        if cnt % 250 == 0:
            logging.info(
                'iteration: %d of %d; %s; best_score= %.4f',
                cnt, num_iters, param_values, best_score
            )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score, best_l_r)

2023-01-08 15:41:32,400 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8527
2023-01-08 15:42:17,791 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8588
2023-01-08 15:43:02,766 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638
2023-01-08 15:43:47,482 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638
2023-01-08 15:44:32,101 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638


best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 4} 0.8638132295719844 0.10000000000000002


In [72]:
best_n_es = 500
best_l_r = 0.1

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.16, 0.17, 0.01),
    'min_df': np.arange(4, 5, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    for d in np.arange(4, 10, 1):
        vectorizer = TfidfVectorizer(**param_values).fit(X_train)
        X_train_csr = vectorizer.transform(X_train)
        #lr = LogisticRegression().fit(X_train_csr, y_train)
        gbr = GradientBoostingRegressor(random_state=1, n_estimators=best_n_es, learning_rate=best_l_r, max_depth=d).fit(X_train_csr, y_train)
        # predict
        X_test_csr = vectorizer.transform(X_test)
        y_pred = gbr.predict(X_test_csr)
        y_pred = [0 if i < 0.5 else 1 for i in y_pred]
        cur_score = f1_score(y_true, y_pred)
        if cur_score > best_score:
            best_score = cur_score
            best_params.update(param_values)
            best_d = d
        if cnt % 250 == 0:
            logging.info(
                'iteration: %d of %d; %s; best_score= %.4f',
                cnt, num_iters, param_values, best_score
            )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score, best_d)

2023-01-08 16:28:48,493 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8638
2023-01-08 16:30:18,291 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8654
2023-01-08 16:32:06,421 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8654
2023-01-08 16:34:10,017 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8654
2023-01-08 16:36:32,718 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8654
2023-01-08 16:39:14,381 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8654


best params: %s, best_score %.4f {'max_df': 0.17, 'min_df': 4} 0.8675623800383877 8


In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.16, 0.17, 0.01),
    'min_df': np.arange(4, 5, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    for rs in np.arange(3, 10, 1):
        vectorizer = TfidfVectorizer(**param_values).fit(X_train)
        X_train_csr = vectorizer.transform(X_train)
        #lr = LogisticRegression().fit(X_train_csr, y_train)
        gbr = GradientBoostingRegressor(random_state=rs, n_estimators=best_n_es, max_depth=best_d).fit(X_train_csr, y_train)
        # predict
        X_test_csr = vectorizer.transform(X_test)
        y_pred = gbr.predict(X_test_csr)
        y_pred = [0 if i < 0.5 else 1 for i in y_pred]
        cur_score = f1_score(y_true, y_pred)
        if cur_score > best_score:
            best_score = cur_score
            best_params.update(param_values)
            best_rs = rs
        if cnt % 250 == 0:
            logging.info(
                'iteration: %d of %d; %s; best_score= %.4f',
                cnt, num_iters, param_values, best_score
            )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score, best_rs)

2023-01-08 17:41:15,839 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8627
2023-01-08 17:41:59,091 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8637
2023-01-08 17:42:42,143 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681
2023-01-08 17:43:25,148 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681
2023-01-08 17:44:08,252 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681
2023-01-08 17:44:50,581 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681
2023-01-08 17:45:32,869 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681


best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 4} 0.8680688336520077 5


In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.15, 0.18, 0.01),
    'min_df': np.arange(4, 5, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    for d in np.arange(8, 10, 1):
        vectorizer = TfidfVectorizer(**param_values).fit(X_train)
        X_train_csr = vectorizer.transform(X_train)
        #lr = LogisticRegression().fit(X_train_csr, y_train)
        gbr = GradientBoostingRegressor(random_state=best_rs, n_estimators=best_n_es, max_depth=d).fit(X_train_csr, y_train)
        # predict
        X_test_csr = vectorizer.transform(X_test)
        y_pred = gbr.predict(X_test_csr)
        y_pred = [0 if i < 0.5 else 1 for i in y_pred]
        cur_score = f1_score(y_true, y_pred)
        if cur_score > best_score:
            best_score = cur_score
            best_params.update(param_values)
            best_d = d
        if cnt % 250 == 0:
            logging.info(
                'iteration: %d of %d; %s; best_score= %.4f',
                cnt, num_iters, param_values, best_score
            )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score, best_d)

2023-01-08 18:49:37,428 iteration: 0 of 3; {'max_df': 0.15, 'min_df': 4}; best_score= 0.8423
2023-01-08 18:52:12,685 iteration: 0 of 3; {'max_df': 0.15, 'min_df': 4}; best_score= 0.8423


best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 4} 0.8680688336520077 8


In [26]:
best_rs = 5
best_n_es = 500
best_d = 8

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.16, 0.17, 0.01),
    'min_df': np.arange(4, 5, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    for rs in np.arange(3, 10, 1):
        vectorizer = TfidfVectorizer(**param_values).fit(X_train)
        X_train_csr = vectorizer.transform(X_train)
        #lr = LogisticRegression().fit(X_train_csr, y_train)
        gbr = GradientBoostingRegressor(random_state=rs, n_estimators=best_n_es, max_depth=best_d).fit(X_train_csr, y_train)
        # predict
        X_test_csr = vectorizer.transform(X_test)
        y_pred = gbr.predict(X_test_csr)
        y_pred = [0 if i < 0.5 else 1 for i in y_pred]
        cur_score = f1_score(y_true, y_pred)
        if cur_score > best_score:
            best_score = cur_score
            best_params.update(param_values)
            best_rs = rs
        if cnt % 250 == 0:
            logging.info(
                'iteration: %d of %d; %s; best_score= %.4f',
                cnt, num_iters, param_values, best_score
            )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score, best_rs)

2023-01-08 23:35:47,604 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8627
2023-01-08 23:36:30,057 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8637
2023-01-08 23:37:12,723 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681
2023-01-08 23:37:55,237 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681
2023-01-08 23:38:37,996 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681
2023-01-08 23:39:20,386 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681
2023-01-08 23:40:03,293 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681


best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 4} 0.8680688336520077 5


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

grid = {
    'max_df': np.arange(0.16, 0.17, 0.01),
    'min_df': np.arange(4, 5, 1),
}

best_params = {'max_df': None, 'min_df': None}
best_score = 0.0

num_iters = len(ParameterGrid(grid))

cnt = 0
for param_values in ParameterGrid(grid):
    # fit 
    vectorizer = TfidfVectorizer(**param_values).fit(X_train)
    X_train_csr = vectorizer.transform(X_train)
    #lr = LogisticRegression().fit(X_train_csr, y_train)
    gbr = GradientBoostingRegressor(alpha=0.9, random_state=5, n_estimators=best_n_es, max_depth=best_d).fit(X_train_csr, y_train)
    # predict
    X_test_csr = vectorizer.transform(X_test)
    y_pred = gbr.predict(X_test_csr)
    y_pred = [0 if i < 0.5 else 1 for i in y_pred]
    cur_score = f1_score(y_true, y_pred)
    if cur_score > best_score:
        best_score = cur_score
        best_params.update(param_values)
        best_a = a
    if cnt % 250 == 0:
        logging.info(
            'iteration: %d of %d; %s; best_score= %.4f',
            cnt, num_iters, param_values, best_score
        )
    cnt = cnt + 1
print('best params: %s, best_score %.4f', best_params, best_score, best_a, best_rs, best_n_es, best_d)

2023-01-09 00:35:31,196 iteration: 0 of 2; {'max_df': 0.16, 'min_df': 4}; best_score= 0.8681


best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 4} 0.8680688336520077 0.9 5 500 8


In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']


vectorizer = TfidfVectorizer(**param_values).fit(X_train)
X_train_csr = vectorizer.transform(X_train)
gbr = GradientBoostingRegressor(random_state=1, n_estimators=500, max_depth=8).fit(X_train_csr, y_train)
# predict
X_test_csr = vectorizer.transform(X_test)
y_pred = gbr.predict(X_test_csr)
y_pred = [0 if i < 0.5 else 1 for i in y_pred]
cur_score = f1_score(y_true, y_pred)

print('best params: %s, best_score %.4f', param_values, cur_score)

{'max_df': 0.16, 'min_df': 4}
best params: %s, best_score %.4f {'max_df': 0.16, 'min_df': 4} 0.8675623800383877


In [46]:
import pickle

model = GradientBoostingRegressor(random_state=1, n_estimators=best_n_es, max_depth=best_d).fit(X_train_csr, y_train)

pkl_filename = "model.pkl" 
with open(pkl_filename, 'wb') as file: 
    pickle.dump(model, file)

In [43]:
vectorizer = TfidfVectorizer(**param_values).fit(X_train)

pkl_filename = "vectorizer.pkl" 
with open(pkl_filename, 'wb') as file: 
    pickle.dump(vectorizer, file)

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score

import numpy as np
from sklearn.model_selection import ParameterGrid

X_train = train_df['msg'].values
y_train = train_df['label']

X_test = test_df['msg'].values
y_true = test_df['label']

X_train_csr = vectorizer.transform(X_train)
gbr = model
# predict
X_test_csr = vectorizer.transform(X_test)
y_pred = gbr.predict(X_test_csr)
y_pred = [0 if i < 0.5 else 1 for i in y_pred]
cur_score = f1_score(y_true, y_pred)

print('best_score %.4f', cur_score)


best_score %.4f 0.8675623800383877


In [14]:
df.reset_index(drop=True).rename(columns={'index': 'msg_id'}).head()


Unnamed: 0,msg_id,msg,label,subset
0,0,"–∑–¥—Ä–∞–≤—Å—Ç–≤—É–π—Ç–µ. –∏—à—É 2—Ö —Å–ø–∞–ª—å–Ω—É—é –∫–≤–∞—Ä—Ç–∏—Ä—É –≤ –ª–∏–º–∞—Å—Å–æ–ª–µ. –∂–µ–ª–∞—Ç–µ–ª—å–Ω–æ –≥–µ—Ä–º–∞—Å–æ–π—è. —Å–µ–º—å—è –∏–∑ 2—Ö –≤–∑—Ä–æ—Å–ª—ã—Ö –∏ 2—Ö –¥–µ—Ç–µ–π. –±–µ–∑ –∂–∏–≤–æ—Ç–Ω—ã—Ö. –Ω–∞ –¥–ª–∏—Ç–µ–ª—å–Ω—ã–π —Å—Ä–æ–∫, –±—é–¥–∂–µ—Ç –¥–æ 1000-1500 –µ–≤—Ä–æ. –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è –≤ –ª—Å.",0,train
1,1,#—Å–Ω–∏–º—É –∫–æ–º–Ω–∞—Ç—É –≤ –ª–∏–º–∞—Å—Å–æ–ª–µ –∏–ª–∏ –Ω–µ–¥–∞–ª–µ–∫–æ –æ—Ç –Ω–µ–≥–æ. —Å –Ω–∞—á–∞–ª–∞ –∞–≤–≥—É—Å—Ç–∞. –ª—é–±—ã–µ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è –≤ –ª—Å,0,train
2,2,–º–æ—à–µ–Ω–Ω–∏–∫ —Ä–∏—ç–ª—Ç–æ—Ä—Å–∫–∏–º —É—Å–ª—É–≥–∞–º.,0,train
3,3,"**sales** reg.1053 lic.489/e **stylish apartment with sea view kissonerga. paphos** ‚Ä¢total area: 85 m2 + balcony ‚Ä¢bedrooms: 2 ‚Ä¢bathrooms: 1 **‚Ç¨ 120,000** we have a lot to offer ================ **–ø—Ä–æ–¥–∞–∂–∞** reg.1053 lic.489/e **—Å—Ç–∏–ª—å–Ω—ã–µ –∞–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç—ã –≤–∏–¥ –Ω–∞ –º–æ—Ä–µ ‚Ä¢‚Ä¢‚Ä¢kissonerga. –ø–∞—Ñ–æ—Å. ** ‚Ä¢–æ–±—â–∞—è –ø–ª–æ—â–∞–¥—å: 85–º2 + –±–∞–ª–∫–æ–Ω ‚Ä¢—Å–ø–∞–ª—å–Ω–∏: 2 ‚Ä¢–≤–∞–Ω–Ω—ã–µ –∫–æ–º–Ω–∞—Ç—ã: 1 **‚Ç¨ 120 000 ****+35726935826**** –¥–∏—Ä–µ–∫—Ç telegram 24/7** —É –Ω–∞—Å –µ—Å—Ç—å —á—Ç–æ –≤–∞–º –ø—Ä–µ–¥–ª–æ–∂–∏—Ç—å",0,train
4,4,"–≤–∞–∂–Ω–æ: [valerii korol](tg://user?id=193474890), –µ—Å–ª–∏ —Ç—ã –Ω–µ –±–æ—Ç –∏ –Ω–µ —Å–ø–∞–º–µ—Ä, –ø—Ä–æ–π–¥–∏ –ø—Ä–æ–≤–µ—Ä–∫—É, –Ω–∞–∂–∞–≤ –Ω–∞ –∫–Ω–æ–ø–∫—É, –≥–¥–µ –µ—Å—Ç—å",0,train


In [26]:
df.reset_index(drop=True).rename(columns={'index': 'msg_id'}).to_csv('data/labeled_data_corpus2.csv', index=False)


In [27]:
df2 = pd.read_csv('data/labeled_data_corpus2.csv')
df2.head()

Unnamed: 0,msg,label
0,"–∑–¥—Ä–∞–≤—Å—Ç–≤—É–π—Ç–µ. –∏—à—É 2—Ö —Å–ø–∞–ª—å–Ω—É—é –∫–≤–∞—Ä—Ç–∏—Ä—É –≤ –ª–∏–º–∞—Å—Å–æ–ª–µ. –∂–µ–ª–∞—Ç–µ–ª—å–Ω–æ –≥–µ—Ä–º–∞—Å–æ–π—è. —Å–µ–º—å—è –∏–∑ 2—Ö –≤–∑—Ä–æ—Å–ª—ã—Ö –∏ 2—Ö –¥–µ—Ç–µ–π. –±–µ–∑ –∂–∏–≤–æ—Ç–Ω—ã—Ö. –Ω–∞ –¥–ª–∏—Ç–µ–ª—å–Ω—ã–π —Å—Ä–æ–∫, –±—é–¥–∂–µ—Ç –¥–æ 1000-1500 –µ–≤—Ä–æ. –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è –≤ –ª—Å.",0
1,#—Å–Ω–∏–º—É –∫–æ–º–Ω–∞—Ç—É –≤ –ª–∏–º–∞—Å—Å–æ–ª–µ –∏–ª–∏ –Ω–µ–¥–∞–ª–µ–∫–æ –æ—Ç –Ω–µ–≥–æ. —Å –Ω–∞—á–∞–ª–∞ –∞–≤–≥—É—Å—Ç–∞. –ª—é–±—ã–µ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è –≤ –ª—Å,0
2,–º–æ—à–µ–Ω–Ω–∏–∫ —Ä–∏—ç–ª—Ç–æ—Ä—Å–∫–∏–º —É—Å–ª—É–≥–∞–º.,0
3,"**sales** reg.1053 lic.489/e **stylish apartment with sea view kissonerga. paphos** ‚Ä¢total area: 85 m2 + balcony ‚Ä¢bedrooms: 2 ‚Ä¢bathrooms: 1 **‚Ç¨ 120,000** we have a lot to offer ================ **–ø—Ä–æ–¥–∞–∂–∞** reg.1053 lic.489/e **—Å—Ç–∏–ª—å–Ω—ã–µ –∞–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç—ã –≤–∏–¥ –Ω–∞ –º–æ—Ä–µ ‚Ä¢‚Ä¢‚Ä¢kissonerga. –ø–∞—Ñ–æ—Å. ** ‚Ä¢–æ–±—â–∞—è –ø–ª–æ—â–∞–¥—å: 85–º2 + –±–∞–ª–∫–æ–Ω ‚Ä¢—Å–ø–∞–ª—å–Ω–∏: 2 ‚Ä¢–≤–∞–Ω–Ω—ã–µ –∫–æ–º–Ω–∞—Ç—ã: 1 **‚Ç¨ 120 000 ****+35726935826**** –¥–∏—Ä–µ–∫—Ç telegram 24/7** —É –Ω–∞—Å –µ—Å—Ç—å —á—Ç–æ –≤–∞–º –ø—Ä–µ–¥–ª–æ–∂–∏—Ç—å",0
4,"–≤–∞–∂–Ω–æ: [valerii korol](tg://user?id=193474890), –µ—Å–ª–∏ —Ç—ã –Ω–µ –±–æ—Ç –∏ –Ω–µ —Å–ø–∞–º–µ—Ä, –ø—Ä–æ–π–¥–∏ –ø—Ä–æ–≤–µ—Ä–∫—É, –Ω–∞–∂–∞–≤ –Ω–∞ –∫–Ω–æ–ø–∫—É, –≥–¥–µ –µ—Å—Ç—å",0
5,"–∞—Ä–µ–Ω–¥–∞ no: 367/e Ô∏è–ª–∞—Ä–Ω–∞–∫–∞Ô∏è–º–µ–∂–¥—É –ø–∏–ª–∞ –∏ –¥–µ–∫–µ–ª–∏—è Ô∏è –≤ –ø–µ—à–µ–π –¥–æ—Å—Ç—É–ø–Ω–æ—Å—Ç–∏ –æ—Ç –º–æ—Ä—è. —Ç–∏—Ö–∏–π —Ä–∞–π–æ–Ω. –≤–∏–ª–ª–∞ 3 —Å–ø–∞–ª—å–Ω–∏, 3 —Å–∞–Ω—É–∑–ª–∞ –ø—Ä–æ—Å—Ç–æ—Ä–Ω–∞—è –≤–µ—Ä–∞–Ω–¥–∞ —Å –ª–µ—Ç–Ω–µ–π –º–µ–±–µ–ª—å—é –æ—Ç–∫—Ä—ã—Ç–æ–≥–æ –ø–ª–∞–Ω–∞ –∫—É—Ö–Ω—è (–¥—É—Ö–æ–≤–∫–∞, –≤—ã—Ç—è–∂–∫–∞ –∏ –∫—É—Ö–æ–Ω–Ω–∞—è –ø–ª–∏—Ç–∞, –ø–æ—Å—É–¥–æ–º–æ–µ—á–Ω–∞—è –º–∞—à–∏–Ω–∫–∞ —Å—Ç–∏—Ä–∞–ª—å–Ω–∞—è –º–∞—à–∏–Ω–∞ –∏ —Å—É—à–∏–ª–∫–∞) –±–∞—Å—Å–µ–π–Ω –æ—Ç–æ–ø–ª–µ–Ω–∏–µ –∫—Ä—ã—Ç–∞—è –∂–∏–ª–∞—è –ø–ª–æ—â–∞–¥—å —Å–æ—Å—Ç–∞–≤–ª—è–µ—Ç –æ–∫–æ–ª–æ 300 –∫–≤.–º. –¥–æ–º –ø–æ–ª–Ω–æ—Å—Ç—å—é –æ–±—Å—Ç–∞–≤–ª–µ–Ω —Å–æ–≤—Ä–µ–º–µ–Ω–Ω–æ–π –º–µ–±–µ–ª—å—é. —Ü–µ–Ω–∞: 2,100 –µ–≤—Ä–æ. –æ–¥–∏–Ω –¥–µ–ø–æ–∑–∏—Ç 2 –ø—Ä–µ–¥–æ–ø–ª–∞—Ç—ã. –ø–æ–¥—Ä–æ–±–Ω–µ–µ –ø–æ —Ç–µ–ª–µ—Ñ–æ–Ω—É +35797726055",0
6,–ø—Ä–∏–≤–µ—Ç –∏—â—É –≤–∏–ª–ª—É –ø–æ—Å—É—Ç–æ—á–Ω–æ —Å –±—é–¥–∂–µ—Ç–æ–º 2000‚Ç¨ –≤ —Å—É—Ç–∫–∏ –æ—Ç 1 –¥–æ 3 –¥–Ω–µ–π —Å 11 –ø–æ 15 –∞–≤–≥—É—Å—Ç–∞ –ø—Ä–∏–º–µ—Ä–Ω–æ –≤–∏–ª–ª–∞ –Ω—É–∂–Ω–∞ –¥–ª—è –≤–∏–¥–µ–æ—Å—ä–µ–º–∫–∏ –ª—é–±–æ–π –≥–æ—Ä–æ–¥ –∏–ª–∏ –ø–æ—Å—ë–ª–æ–∫ —Å–æ–≤—Ä–µ–º–µ–Ω–Ω—ã–π –¥–∏–∑–∞–π–Ω –∫—Ä–∞–π–Ω–µ –≤–∞–∂–µ–Ω –ø–∏—à–∏—Ç–µ –º–Ω–µ –≤ –ª–∏—á–∫—É –∏–ª–∏ –ø–æ –Ω–æ–º–µ—Ä—É 95727146 —é–ª—è,0
7,"–≤–∞–∂–Ω–æ: [liss](tg://user?id=202814885), –µ—Å–ª–∏ —Ç—ã –Ω–µ –±–æ—Ç –∏ –Ω–µ —Å–ø–∞–º–µ—Ä, –ø—Ä–æ–π–¥–∏ –ø—Ä–æ–≤–µ—Ä–∫—É, –Ω–∞–∂–∞–≤ –Ω–∞ –∫–Ω–æ–ø–∫—É, –≥–¥–µ –µ—Å—Ç—å",0
8,total messages: 126772,0
9,"–∞—Ä–µ–Ω–¥–∞ Ô∏è–ª–∞—Ä–Ω–∞–∫–∞ Ô∏è–≤ —Ü–µ–Ω—Ç—Ä–µ –≥–æ—Ä–æ–¥–∞Ô∏è saint lazaro church Ô∏è –≤—Å—ë –≤ –ø–µ—à–µ–π –¥–æ—Å—Ç—É–ø–Ω–æ—Å—Ç–∏ –¥–æ —Ü–µ–Ω—Ç—Ä–∞–ª—å–Ω–æ–π –Ω–∞–±–µ—Ä–µ–∂–Ω–æ–π —Ñ–∏–Ω–µ–∫—É–¥–µ—Å 5 –º–∏–Ω—É—Ç. no: 367/–µ –∫–≤–∞—Ä—Ç–∏—Ä–∞ 3 —Å–ø–∞–ª—å–Ω—è 2 —Å–∞–Ω—É–∑–µ–ª –∫—É—Ö–Ω—è –æ—Ç–∫—Ä—ã—Ç–æ–≥–æ –ø–ª–∞–Ω–∞, –≤—Å–µ —ç–ª–µ–∫—Ç—Ä–æ –ø—Ä–∏–±–æ—Ä—ã —Å—Ç–∏—Ä–∞–ª—å–Ω–∞—è –º–∞—à–∏–Ω–∫–∞ –ø–æ—Å—É–¥–æ–º–æ–µ—á–Ω–∞—è –º–∞—à–∏–Ω–∫–∞ –±–∞–ª–∫–æ–Ω—ã –∫–æ–Ω–¥–∏—Ü–∏–æ–Ω–µ—Ä—ã –ø–∞—Ä–∫–æ–≤–∫–∞ —Å—Ç–æ–∏–º–æ—Å—Ç—å: 1.300 –µ–≤—Ä–æ —Ç–æ—Ä–≥ —É–º–µ—Å—Ç–µ–Ω –ø–æ–¥—Ä–æ–±–Ω–µ–µ –ø–æ —Ç–µ–ª–µ—Ñ–æ–Ω—É +35797726055",0
