In [1]:
import pandas as pd

df_train = pd.read_csv('./dataset/kaggle/train.csv')
df_test = pd.read_csv('./dataset/kaggle/test.csv')

In [2]:
import pandas as pd
import re
from bs4 import BeautifulSoup

# 讀取資料
data = pd.read_csv('./dataset/kaggle/train.csv')
data_test = pd.read_csv('./dataset/kaggle/test.csv')
# data = pd.read_csv('./dataset/kaggle/train.csv')
# data_test = pd.read_csv('./dataset/kaggle/test.csv')

# 定義一個函數來清理HTML內容
def clean_html(soup):
    return soup.get_text()

# 提取標題
def extract_title(soup):
    title_tag = soup.find('h1')
    return title_tag.get_text() if title_tag else None

# 提取作者
def extract_author(soup):
    article_info = soup.find('div', class_='article-info')

    if article_info:
        author_name = article_info.find('span', class_='author_name')
        if author_name:
            author = author_name.get_text()
        elif article_info.span:
            author = article_info.span.get_text()
        elif article_info.a:
            author = article_info.a.get_text()
        else:
            author = "none"
    else:
        author = "none"

    # 清理作者名稱
    author = re.sub(r'\s+', ' ', author.strip().lower())
    if author.startswith('by '):
        author = author[3:]  # 去掉前綴 'by '
    author = re.sub(r'&.*;', '&', author.replace(' and ', ' & '))

    # 分離作者
    author_list = re.split(r'\s*,\s*|\s*&\s*', author)
    author = ' '.join([re.sub(r'\s+', '_', a) for a in author_list])

    return author

# 提取主題
def extract_topic(soup):
    topic_tag = soup.find('footer', class_='article-topics')
    if topic_tag:
        return ', '.join([a.get_text().strip() for a in topic_tag.find_all('a')])
    return "none"

# 提取時間
def extract_time(soup):
    time_tag = soup.find('time')
    return time_tag['datetime'] if time_tag and 'datetime' in time_tag.attrs else "none"

# 定義一個處理函數，避免重複使用 BeautifulSoup
def process_content(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    
    cleaned_content = clean_html(soup)
    title = extract_title(soup)
    author = extract_author(soup)
    topic = extract_topic(soup)
    time = extract_time(soup)

    # 移除作者和時間
    cleaned_content = re.sub(r'(?i)(by\s+[\w\s]+|for\s+[\w\s]+)\s*\d{4}-\d{2}-\d{2}\s*\d{2}:\d{2}:\d{2}\s*UTC?', '', cleaned_content)

    return cleaned_content, title, author, topic, time

# 應用處理函數到所有資料
data[['cleaned_content', 'title', 'author', 'topic', 'time']] = data['Page content'].apply(
    lambda x: pd.Series(process_content(x))
)
data_test[['cleaned_content', 'title', 'author', 'topic', 'time']] = data_test['Page content'].apply(
    lambda x: pd.Series(process_content(x))
)

# 將 ID 轉換為數字，非數字的會變成 NaN
data['Id'] = pd.to_numeric(data['Id'], errors='coerce')
data_test['Id'] = pd.to_numeric(data_test['Id'], errors='coerce')

# 丟掉 ID 為 NaN 的列
data = data.dropna(subset=['Id'])
data_test = data_test.dropna(subset=['Id'])

# 選擇需要的列
result = data[['Id', 'title', 'author', 'topic', 'time', 'cleaned_content']]
result_test = data_test[['Id', 'title', 'author', 'topic', 'time', 'cleaned_content']]

# 顯示前10筆結果
print(result.head())


   Id                                              title            author  \
0   0  NASA's Grand Challenge: Stop Asteroids From De...   clara_moskowitz   
1   1  Google's New Open Source Patent Pledge: We Won...  christina_warren   
2   2  Ballin': 2014 NFL Draft Picks Get to Choose Th...         sam_laird   
3   3        Cameraperson Fails Deliver Slapstick Laughs         sam_laird   
4   4  NFL Star Helps Young Fan Prove Friendship With...   connor_finnegan   

                                               topic  \
0  Asteroid, Asteroids, challenge, Earth, Space, ...   
1  Apps and Software, Google, open source, opn pl...   
2  Entertainment, NFL, NFL Draft, Sports, Television   
3                 Sports, Video, Videos, Watercooler   
4  Entertainment, instagram, instagram video, NFL...   

                              time  \
0  Wed, 19 Jun 2013 15:04:30 +0000   
1  Thu, 28 Mar 2013 17:40:55 +0000   
2  Wed, 07 May 2014 19:15:20 +0000   
3  Fri, 11 Oct 2013 02:26:50 +0000   
4  T

In [3]:
from datetime import datetime
import pandas as pd

# 定义一个函数来细分时间
def split_time(time_str, previous_date=None):
    # 如果时间字符串是 None 或 'none'，使用之前的日期
    if time_str is None or time_str.lower() == 'none':
        return previous_date if previous_date is not None else (None, None, None, None, None)
    
    # 解析时间字符串
    dt = datetime.strptime(time_str, '%a, %d %b %Y %H:%M:%S %z')
    return dt.year, dt.month, dt.day, dt.hour, dt.weekday() + 1

# 定义一个辅助函数来处理整个数据集
def process_time_data(data):
    previous_date = None  # 储存上一笔文章的日期
    time_parts = []  # 储存细分后的时间部分

    for index, (time_str, id_value) in enumerate(zip(data['time'], data['Id'])):
        if time_str is None or time_str.lower() == 'none':  # 如果时间字符串是 None 或 'none'
            if previous_date is not None:
                time_parts.append(previous_date)
            else:
                time_parts.append((None, None, None, None, None))  # 如果也没有上一笔日期
            print(f"Encountered None for ID: {id_value}")  # 打印当前 ID
        else:
            # 解析时间并获得日期部分
            year, month, day, hour, weekday = split_time(time_str)
            time_parts.append((year, month, day, hour, weekday))
            previous_date = (year, month, day, hour, weekday)  # 更新上一笔日期

    # 将时间部分转换为 DataFrame
    return pd.DataFrame(time_parts, columns=['T_year', 'T_month', 'T_day', 'T_hour', 'T_weekday'])

# 应用细分时间函数
result_time_data = process_time_data(result)
result_test_time_data = process_time_data(result_test)

# 将细分后的时间数据添加到原始数据中
result[['T_year', 'T_month', 'T_day', 'T_hour', 'T_weekday']] = result_time_data
result_test[['T_year', 'T_month', 'T_day', 'T_hour', 'T_weekday']] = result_test_time_data


# 显示处理后的前10笔结果
result.head()

Encountered None for ID: 29228


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result[['T_year', 'T_month', 'T_day', 'T_hour', 'T_weekday']] = result_time_data


Unnamed: 0,Id,title,author,topic,time,cleaned_content,T_year,T_month,T_day,T_hour,T_weekday
0,0,NASA's Grand Challenge: Stop Asteroids From De...,clara_moskowitz,"Asteroid, Asteroids, challenge, Earth, Space, ...","Wed, 19 Jun 2013 15:04:30 +0000",Clara Moskowitz for Space.com 2013-06-19 15:0...,2013,6,19,15,3
1,1,Google's New Open Source Patent Pledge: We Won...,christina_warren,"Apps and Software, Google, open source, opn pl...","Thu, 28 Mar 2013 17:40:55 +0000",Google's New Open Source Patent Pledge: We Won...,2013,3,28,17,4
2,2,Ballin': 2014 NFL Draft Picks Get to Choose Th...,sam_laird,"Entertainment, NFL, NFL Draft, Sports, Television","Wed, 07 May 2014 19:15:20 +0000",Ballin': 2014 NFL Draft Picks Get to Choose Th...,2014,5,7,19,3
3,3,Cameraperson Fails Deliver Slapstick Laughs,sam_laird,"Sports, Video, Videos, Watercooler","Fri, 11 Oct 2013 02:26:50 +0000",Cameraperson Fails Deliver Slapstick Laughs ...,2013,10,11,2,5
4,4,NFL Star Helps Young Fan Prove Friendship With...,connor_finnegan,"Entertainment, instagram, instagram video, NFL...","Thu, 17 Apr 2014 03:31:43 +0000",NFL Star Helps Young Fan Prove Friendship With...,2014,4,17,3,4


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np

# Step 1: Install NLTK (if not already installed)
# !pip install nltk
# Step 2: Download NLTK data (run this once)
# nltk.download('stopwords')
# Step 3: Define the tokenization function
def tokenize_author(text):
    if type(text) == np.ndarray:
        text = text[0]
    authors = re.split(',', text)
    for idx, author in enumerate(authors):
        authors[idx] = re.sub(' ', '', author)
    return authors


def tokenize(text):
    # Initialize the PorterStemmer
    stemmer = PorterStemmer()
    # Get the list of stopwords
    stop_words = set(stopwords.words('english'))
    # Tokenize the text by splitting on spaces
    words = text.split()
    # Remove stopwords and apply stemming
    tokens = [stemmer.stem(word) for word in words if word.lower() not in stop_words]
    return tokens



In [5]:
# Apply the tokenization function to the 'cleaned_content' column
result['tokens'] = result['cleaned_content'].apply(tokenize)
result_test['tokens'] = result_test['cleaned_content'].apply(tokenize)

# Display the first 10 rows of the result DataFrame
result.head()

Unnamed: 0,Id,title,author,topic,time,cleaned_content,T_year,T_month,T_day,T_hour,T_weekday,tokens
0,0,NASA's Grand Challenge: Stop Asteroids From De...,clara_moskowitz,"Asteroid, Asteroids, challenge, Earth, Space, ...","Wed, 19 Jun 2013 15:04:30 +0000",Clara Moskowitz for Space.com 2013-06-19 15:0...,2013,6,19,15,3,"[clara, moskowitz, space.com, 2013-06-19, 15:0..."
1,1,Google's New Open Source Patent Pledge: We Won...,christina_warren,"Apps and Software, Google, open source, opn pl...","Thu, 28 Mar 2013 17:40:55 +0000",Google's New Open Source Patent Pledge: We Won...,2013,3,28,17,4,"[google', new, open, sourc, patent, pledge:, s..."
2,2,Ballin': 2014 NFL Draft Picks Get to Choose Th...,sam_laird,"Entertainment, NFL, NFL Draft, Sports, Television","Wed, 07 May 2014 19:15:20 +0000",Ballin': 2014 NFL Draft Picks Get to Choose Th...,2014,5,7,19,3,"[ballin':, 2014, nfl, draft, pick, get, choos,..."
3,3,Cameraperson Fails Deliver Slapstick Laughs,sam_laird,"Sports, Video, Videos, Watercooler","Fri, 11 Oct 2013 02:26:50 +0000",Cameraperson Fails Deliver Slapstick Laughs ...,2013,10,11,2,5,"[cameraperson, fail, deliv, slapstick, laugh, ..."
4,4,NFL Star Helps Young Fan Prove Friendship With...,connor_finnegan,"Entertainment, instagram, instagram video, NFL...","Thu, 17 Apr 2014 03:31:43 +0000",NFL Star Helps Young Fan Prove Friendship With...,2014,4,17,3,4,"[nfl, star, help, young, fan, prove, friendshi..."


In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# 假設 result 和 result_test 是已經存在的 DataFrame，且 tokenization 函數已經應用於 cleaned_content
# 例：result['tokens'] = result['cleaned_content'].apply(tokenize)
# result_test['tokens'] = result_test['cleaned_content'].apply(tokenize)

# 將 tokens 列轉換為字符串格式
result['tokens_str'] = result['tokens'].apply(lambda x: ' '.join(x))
result_test['tokens_str'] = result_test['tokens'].apply(lambda x: ' '.join(x))

# 定义 TF-IDF 向量化器
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=20)

# 计算 TF-IDF 值
tfidf_matrix_train = tfidf_vectorizer.fit_transform(result['tokens_str'])
tfidf_matrix_test = tfidf_vectorizer.transform(result_test['tokens_str'])

# 将 TF-IDF 矩阵转换为 DataFrame
tfidf_df_train = pd.DataFrame(tfidf_matrix_train.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df_test = pd.DataFrame(tfidf_matrix_test.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# 为 TF-IDF 特征添加前缀 'TF_'
tfidf_df_train.columns = ['TF_' + col for col in tfidf_df_train.columns]
tfidf_df_test.columns = ['TF_' + col for col in tfidf_df_test.columns]

# 将 TF-IDF 特征添加到原始 DataFrame
result = pd.concat([result, tfidf_df_train], axis=1)
result_test = pd.concat([result_test, tfidf_df_test], axis=1)

# 显示处理后的前 10 行结果
result.head(5)



Unnamed: 0,Id,title,author,topic,time,cleaned_content,T_year,T_month,T_day,T_hour,...,TF_one,TF_said,TF_see,TF_time,TF_topics,TF_use,TF_video,TF_work,TF_world,TF_year
0,0,NASA's Grand Challenge: Stop Asteroids From De...,clara_moskowitz,"Asteroid, Asteroids, challenge, Earth, Space, ...","Wed, 19 Jun 2013 15:04:30 +0000",Clara Moskowitz for Space.com 2013-06-19 15:0...,2013,6,19,15,...,0.111899,0.731597,0.085364,0.123299,0.073995,0.0,0.145548,0.144924,0.129345,0.0
1,1,Google's New Open Source Patent Pledge: We Won...,christina_warren,"Apps and Software, Google, open source, opn pl...","Thu, 28 Mar 2013 17:40:55 +0000",Google's New Open Source Patent Pledge: We Won...,2013,3,28,17,...,0.0,0.0,0.142118,0.205275,0.123191,0.439656,0.0,0.0,0.0,0.0
2,2,Ballin': 2014 NFL Draft Picks Get to Choose Th...,sam_laird,"Entertainment, NFL, NFL Draft, Sports, Television","Wed, 07 May 2014 19:15:20 +0000",Ballin': 2014 NFL Draft Picks Get to Choose Th...,2014,5,7,19,...,0.143203,0.0,0.109245,0.394481,0.047348,0.0,0.186265,0.0,0.082765,0.17066
3,3,Cameraperson Fails Deliver Slapstick Laughs,sam_laird,"Sports, Video, Videos, Watercooler","Fri, 11 Oct 2013 02:26:50 +0000",Cameraperson Fails Deliver Slapstick Laughs ...,2013,10,11,2,...,0.0,0.0,0.105251,0.304046,0.091234,0.0,0.897276,0.0,0.159478,0.0
4,4,NFL Star Helps Young Fan Prove Friendship With...,connor_finnegan,"Entertainment, instagram, instagram video, NFL...","Thu, 17 Apr 2014 03:31:43 +0000",NFL Star Helps Young Fan Prove Friendship With...,2014,4,17,3,...,0.012333,0.0,0.018816,0.0,0.008155,0.0,0.032083,0.0,0.085533,0.0


In [7]:
from nltk.stem import WordNetLemmatizer
# nltk.download('stopwords')
stop = stopwords.words('english')
stop.extend([',',':','.','(',')',"'",'nt','>','<','?','-','_','*','%',';','~','`','``','--','[',']','[]',"'s",'also','imag','courtesi'])


def tokenizer(text):
    if type(text) == np.ndarray:
        text = text[0]
    return re.split(r'\s+', text.strip())

def tokenizer_stem_lemma_nostop(text):
    clean_tokens = []
    # Remove stopwords
    for token in text:
        if token not in stop:
            clean_tokens.append(token)

    # PorterStemmer
    porter = PorterStemmer()
    clean_tokens_porter = [porter.stem(w) for w in clean_tokens]

    # Lemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    clean_tokens_lemmatizer = [wordnet_lemmatizer.lemmatize(w, pos="v") for w in clean_tokens_porter]

    return(clean_tokens_lemmatizer)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import pandas as pd
from sklearn.compose import ColumnTransformer

# 假設 result 和 result_test DataFrame 已經包含 TF-IDF 特徵

# 移除不需要的列
columns_to_drop = ['Id', 'cleaned_content', 'time', 'tokens', 'tokens_str']
result = result.drop(columns=columns_to_drop)
columns_to_drop = [ 'cleaned_content', 'time', 'tokens', 'tokens_str']
result_test = result_test.drop(columns=columns_to_drop)

trans_att = ColumnTransformer(
    [('Author', CountVectorizer(tokenizer=tokenizer, lowercase=False, max_features=5), [1]),
     ('Topic', CountVectorizer(tokenizer=tokenizer_stem_lemma_nostop, lowercase=False, max_features=5), [2]),
     ('Title', CountVectorizer(tokenizer=tokenizer_stem_lemma_nostop, lowercase=False, max_features=5), [0])],
    n_jobs=-1,
    remainder='passthrough'
)
y = (df_train['Popularity'].values == 1).astype(int)

In [9]:
result.head(5)
result.columns

Index(['title', 'author', 'topic', 'T_year', 'T_month', 'T_day', 'T_hour',
       'T_weekday', 'TF_2014', 'TF_also', 'TF_app', 'TF_first', 'TF_get',
       'TF_imag', 'TF_image', 'TF_like', 'TF_make', 'TF_new', 'TF_one',
       'TF_said', 'TF_see', 'TF_time', 'TF_topics', 'TF_use', 'TF_video',
       'TF_work', 'TF_world', 'TF_year'],
      dtype='object')

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()



# columns_to_scale = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
# result[:, columns_to_scale] = sc.fit_transform(result[:, columns_to_scale])
# columns_to_scale2 = [0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
# result_test[:, columns_to_scale] = sc.transform(result_test[:, columns_to_scale2])

X_train_train, X_valid_valid, y_train_train, y_valid_valid = train_test_split( result, y, test_size=0.2, random_state=0)

In [11]:
result.head()

Unnamed: 0,title,author,topic,T_year,T_month,T_day,T_hour,T_weekday,TF_2014,TF_also,...,TF_one,TF_said,TF_see,TF_time,TF_topics,TF_use,TF_video,TF_work,TF_world,TF_year
0,NASA's Grand Challenge: Stop Asteroids From De...,clara_moskowitz,"Asteroid, Asteroids, challenge, Earth, Space, ...",2013,6,19,15,3,0.0,0.4942,...,0.111899,0.731597,0.085364,0.123299,0.073995,0.0,0.145548,0.144924,0.129345,0.0
1,Google's New Open Source Patent Pledge: We Won...,christina_warren,"Apps and Software, Google, open source, opn pl...",2013,3,28,17,4,0.0,0.137128,...,0.0,0.0,0.142118,0.205275,0.123191,0.439656,0.0,0.0,0.0,0.0
2,Ballin': 2014 NFL Draft Picks Get to Choose Th...,sam_laird,"Entertainment, NFL, NFL Draft, Sports, Television",2014,5,7,19,3,0.226123,0.052704,...,0.143203,0.0,0.109245,0.394481,0.047348,0.0,0.186265,0.0,0.082765,0.17066
3,Cameraperson Fails Deliver Slapstick Laughs,sam_laird,"Sports, Video, Videos, Watercooler",2013,10,11,2,5,0.0,0.101555,...,0.0,0.0,0.105251,0.304046,0.091234,0.0,0.897276,0.0,0.159478,0.0
4,NFL Star Helps Young Fan Prove Friendship With...,connor_finnegan,"Entertainment, instagram, instagram video, NFL...",2014,4,17,3,4,0.019474,0.009078,...,0.012333,0.0,0.018816,0.0,0.008155,0.0,0.032083,0.0,0.085533,0.0


In [12]:
y

array([0, 1, 1, ..., 0, 0, 1])

In [13]:
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score

# lgbm = Pipeline([('ct', trans_att),
#                  ('clf', LGBMClassifier(force_row_wise=True, random_state=0, learning_rate=0.005, n_estimators=500, verbose=2))])

# CV
# 對完整的訓練數據進行訓練
# print('[auc (10-fold cv)]')
# scores = cross_val_score(estimator=lgbm, X=result, y=y, cv=2, scoring='roc_auc', n_jobs=1, error_score='raise')
# print(f'LGBMClassifier: {scores.mean():.4f} (+/-{scores.std():.4f})')

# # 同時獲取訓練集的分數和估計器實例
# # scores = cross_validate(estimator=clf, X=X_train_all, y=y_train_all, cv =10,scoring='roc_auc', \
# #                             return_train_score=True, return_estimator=True)
# # print(f"train score: {np.mean(scores['train_score']):.4f} (+/-{np.std(scores['train_score']):.4f}")
# # print(f"valid score: {np.mean(scores['test_score']):.4f} (+/-{np.std(scores['test_score']):.4f}")

# # 切成訓練集和驗證集分數
# lgbm.fit(X_train_train, y_train_train)
# print(f'train scroe: {roc_auc_score(y_train_train, lgbm.predict_proba(X_train_train)[:, 1]):.4f}')
# print(f'valid score: {roc_auc_score(y_valid_valid, lgbm.predict_proba(X_valid_valid)[:, 1]):.4f}')

In [14]:

vectorizer_author = CountVectorizer(tokenizer=tokenizer, lowercase=False)
vectorizer_topic = CountVectorizer(tokenizer=tokenize, lowercase=False)
vectorizer_title = CountVectorizer(tokenizer=tokenize, lowercase=False)
# vectorizer_author = CountVectorizer(tokenizer=tokenizer, lowercase=False, max_features=5)
# vectorizer_topic = CountVectorizer(tokenizer=tokenize, lowercase=False, max_features=5)
# vectorizer_title = CountVectorizer(tokenizer=tokenize, lowercase=False, max_features=5)
X_author = vectorizer_author.fit_transform(result.iloc[:, 1])  # 第1列: Author
X_topic = vectorizer_topic.fit_transform(result.iloc[:, 2])    # 第2列: Topic
X_title = vectorizer_title.fit_transform(result.iloc[:, 0])    # 第0列: Title


X_T_author = vectorizer_author.transform(result_test.iloc[:, 1])  # 第1列: Author
X_T_topic = vectorizer_topic.transform(result_test.iloc[:, 2])    # 第2列: Topic
X_T_title = vectorizer_title.transform(result_test.iloc[:, 0])    # 第0列: Title



In [15]:
result.iloc[:, 1]

0                      clara_moskowitz
1                     christina_warren
2                            sam_laird
3                            sam_laird
4                      connor_finnegan
                     ...              
27638    lorenzo_franceschi-bicchierai
27639                   adario_strange
27640               christine_erickson
27641                   seth_fiegerman
27642                     megan_ranney
Name: author, Length: 27643, dtype: object

In [16]:
print(vectorizer_author.get_feature_names_out())
print(vectorizer_author.vocabulary_)

['#makefurhistory' '2machines' 'a_wolfe' ... 'ziv_eliraz' 'zoe_fox'
 'äkta_design_studio']
{'clara_moskowitz': 273, 'christina_warren': 260, 'sam_laird': 1062, 'connor_finnegan': 285, 'brendan_greeley': 180, 'brian_anthony_hernandez': 185, 'sandra_gonzalez': 1073, 'sara_afzal': 1074, 'jason_abbruzzese': 524, 'amanda_wills': 77, 'andrea_romano': 91, 'stan_schroeder': 1152, 'seth_fiegerman': 1111, 'jonathan_ellis': 599, 'todd_wasserman': 1235, 'lorenzo_franceschi-bicchierai': 763, 'christine_erickson': 261, 'tariq_malik': 1195, 'christopher_miller': 264, 'max_knoblauch': 831, 'emily_chow': 395, 'lance_ulanoff': 700, 'cheri_warren': 248, 'megan_specia': 839, 'gabe_bergado': 440, 'jesse_emspak': 554, 'alex_hazlett': 42, 'brian_ries': 195, 'jenni_ryall': 541, 'neha_prakash': 901, 'kari_paul': 638, 'kurt_wagner': 694, 'christina_ascani': 256, 'nina_frazier_hansen': 918, 'rebecca_hiscott': 1007, 't.l._stanley': 1188, 'karissa_bell': 640, 'chelsea_stark': 246, 'chris_taylor': 254, 'kelsey_juka

In [17]:
print(X_author)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 28225 stored elements and shape (27643, 1291)>
  Coords	Values
  (0, 273)	1
  (1, 260)	1
  (2, 1062)	1
  (3, 1062)	1
  (4, 285)	1
  (5, 180)	1
  (6, 185)	1
  (7, 1073)	1
  (8, 1074)	1
  (9, 524)	1
  (10, 77)	1
  (11, 91)	1
  (12, 1152)	1
  (13, 1111)	1
  (14, 1062)	1
  (15, 599)	1
  (16, 1235)	1
  (17, 91)	1
  (18, 1235)	1
  (19, 1111)	1
  (20, 763)	1
  (21, 261)	1
  (22, 1195)	1
  (23, 264)	1
  (24, 831)	1
  :	:
  (27618, 1120)	1
  (27619, 260)	1
  (27620, 1073)	1
  (27621, 929)	1
  (27622, 1157)	1
  (27623, 1289)	1
  (27624, 277)	1
  (27625, 95)	1
  (27626, 717)	1
  (27627, 614)	1
  (27628, 91)	1
  (27629, 763)	1
  (27630, 261)	1
  (27631, 1152)	1
  (27632, 344)	1
  (27633, 541)	1
  (27634, 165)	1
  (27635, 524)	1
  (27636, 713)	1
  (27637, 185)	1
  (27638, 763)	1
  (27639, 11)	1
  (27640, 261)	1
  (27641, 1111)	1
  (27642, 838)	1


In [18]:
print(vectorizer_topic.get_feature_names_out())
print(vectorizer_topic.vocabulary_)

['"el' '"i' '#1connection,' ... 'zunzuneo' 'zynga' 'zz']


In [19]:
print(X_topic)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 209078 stored elements and shape (27643, 12989)>
  Coords	Values
  (0, 862)	1
  (0, 863)	1
  (0, 2092)	1
  (0, 3550)	1
  (0, 10679)	1
  (0, 11907)	1
  (0, 12753)	1
  (1, 712)	1
  (1, 10604)	1
  (1, 4821)	1
  (1, 8213)	1
  (1, 10665)	1
  (1, 8228)	1
  (1, 8826)	1
  (1, 8494)	1
  (1, 6504)	1
  (1, 11445)	1
  (1, 10603)	1
  (1, 8496)	1
  (1, 11330)	1
  (1, 11906)	1
  (2, 3782)	1
  (2, 7913)	1
  (2, 7912)	1
  (2, 3397)	1
  :	:
  (27639, 3125)	1
  (27639, 9897)	1
  (27639, 12508)	2
  (27639, 12287)	1
  (27639, 12368)	1
  (27639, 11335)	1
  (27639, 9375)	1
  (27639, 5104)	1
  (27640, 12469)	1
  (27640, 5491)	1
  (27640, 8685)	1
  (27640, 4310)	1
  (27640, 5424)	1
  (27640, 3316)	1
  (27641, 1776)	1
  (27641, 7209)	1
  (27641, 7045)	1
  (27641, 7155)	1
  (27641, 12840)	1
  (27641, 10928)	1
  (27642, 1776)	2
  (27642, 10497)	1
  (27642, 10855)	1
  (27642, 2857)	1
  (27642, 934)	1


In [20]:
print(vectorizer_title.get_feature_names_out())
print(vectorizer_title.vocabulary_)

['"bends"' '"best' '"i' ... '‘the' '‘wheel' '…']


In [21]:
print(X_title)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 181550 stored elements and shape (27643, 25419)>
  Coords	Values
  (0, 16286)	1
  (0, 11678)	1
  (0, 6772)	1
  (0, 21868)	1
  (0, 4660)	1
  (0, 8590)	1
  (0, 9367)	1
  (1, 11586)	1
  (1, 16461)	1
  (1, 17043)	1
  (1, 21394)	1
  (1, 17537)	1
  (1, 18056)	1
  (1, 22113)	1
  (1, 23776)	1
  (1, 4718)	1
  (1, 10549)	1
  (2, 5012)	1
  (2, 2962)	1
  (2, 16512)	1
  (2, 9070)	1
  (2, 17865)	1
  (2, 11327)	1
  (2, 6990)	1
  (2, 24370)	1
  :	:
  (27639, 19043)	1
  (27639, 12195)	1
  (27639, 24213)	1
  (27639, 14305)	1
  (27639, 19858)	1
  (27640, 8910)	1
  (27640, 22471)	1
  (27640, 2779)	1
  (27640, 12245)	1
  (27640, 6463)	1
  (27640, 10896)	1
  (27641, 19273)	1
  (27641, 9361)	1
  (27641, 5235)	1
  (27641, 25144)	1
  (27641, 18489)	1
  (27641, 7706)	1
  (27641, 9815)	1
  (27642, 22606)	1
  (27642, 24855)	1
  (27642, 20144)	1
  (27642, 7621)	1
  (27642, 23145)	1
  (27642, 26)	1
  (27642, 4781)	1


In [22]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
import pandas as pd
from sklearn.metrics import roc_auc_score

# 將新的特徵合併
X_new = hstack([X_author, X_topic, X_title])  # 將新特徵合併

# 使用新特徵替換原 DataFrame 的特徵
result = result.drop(columns=['author', 'topic', 'title'])  # 刪除原有的特徵
result = pd.concat([result.reset_index(drop=True), pd.DataFrame(X_new.toarray())], axis=1)  # 將新特徵加入



# 將數據分為訓練集和測試集
X_train_train, X_test_test, y_train_train, y_test_test = train_test_split(result, y, test_size=0.2, random_state=42)

# 初始化並訓練 LGBMClassifier
# model = LGBMClassifier()
model = LGBMClassifier(
    learning_rate=0.005,
    n_estimators=500
)
X_train_train = X_train_train.astype('float32')
X_test_test = X_test_test.astype('float32')
model.fit(X_train_train, y_train_train)

# 使用 predict_proba 預測測試集的概率分布
y_pred_proba = model.predict_proba(X_test_test)[:, 1]

# 計算 AUC
auc = roc_auc_score(y_test_test, y_pred_proba)
print(f'AUC: {auc:.4f}')

[LightGBM] [Info] Number of positive: 10916, number of negative: 11198
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053712 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10640
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 2406
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493624 -> initscore=-0.025506
[LightGBM] [Info] Start training from score -0.025506
AUC: 0.5925


In [23]:
result.head()

Unnamed: 0,T_year,T_month,T_day,T_hour,T_weekday,TF_2014,TF_also,TF_app,TF_first,TF_get,...,39689,39690,39691,39692,39693,39694,39695,39696,39697,39698
0,2013,6,19,15,3,0.0,0.4942,0.0,0.136595,0.132908,...,0,0,0,0,0,0,0,0,0,0
1,2013,3,28,17,4,0.0,0.137128,0.299905,0.454819,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2014,5,7,19,3,0.226123,0.052704,0.0,0.262211,0.680357,...,0,0,0,0,0,0,0,0,0,0
3,2013,10,11,2,5,0.0,0.101555,0.0,0.0,0.16387,...,0,0,0,0,0,0,0,0,0,0
4,2014,4,17,3,4,0.019474,0.009078,0.0,0.0,0.029296,...,0,0,0,0,0,0,0,0,0,0


In [25]:
y_pred_proba

array([0.61615806, 0.46807672, 0.51200914, ..., 0.51036046, 0.47545985,
       0.4905277 ])

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
import pandas as pd

# 假设模型已经训练好，并且 result_test 已经包含了所有需要的特征

# 将新的特征合并
X_new_test = hstack([X_T_author, X_T_topic, X_T_title])  # 将新特征合并

# 使用新特征替换原 DataFrame 的特征
result_test = result_test.drop(columns=['author', 'topic', 'title'])  # 删除原有的特征
result_test = pd.concat([result_test.reset_index(drop=True), pd.DataFrame(X_new_test.toarray())], axis=1)  # 将新特征加入

X_test = result_test.drop(columns=['Id'])
# 使用训练好的模型对测试集进行预测
y_pred_proba_test = model.predict_proba(X_test)[:, 1]

# 创建一个 DataFrame 保存预测结果
predictions = pd.DataFrame({
    'Id': result_test['Id'],
    'Predicted': y_pred_proba_test
})

# 将预测结果保存为 CSV 文件
predictions.to_csv('predictions.csv', index=False)

print("Predictions saved to predictions.csv")