In [37]:
import pandas as pd 
import pytz
from datetime import datetime

from sklearn.feature_extraction.text import CountVectorizer


In [38]:
# Read the training CSV into the working frame used by the cells below.
df = pd.read_csv(filepath_or_buffer='train_half_with_sentiment_withemoji.csv')

In [39]:
# Remove, in place, four columns that the feature-engineering cells below do not read.
df.drop(labels=['author', 'topic', 'post_url', 'scrape_time'], axis='columns', inplace=True)


In [40]:
# Check how post_time was loaded; an object dtype means it has not been parsed to datetime yet.
df.dtypes['post_time']

dtype('O')

In [41]:
# Map an hour of the day onto a coarse time-of-day code
def time_of_day(hour):
    """Return the time-of-day bucket code for ``hour``.

    Codes:
        0 -- midnight   (0 <= hour < 5)
        1 -- morning    (5 <= hour < 12)
        2 -- afternoon  (12 <= hour < 18)
        3 -- night      (anything else)

    Because "night" is the fallback, values that match no range at all
    (negative hours, hours >= 24, NaN) are also coded 3.
    """
    # (inclusive lower bound, exclusive upper bound, code)
    buckets = (
        (0, 5, 0),    # Midnight
        (5, 12, 1),   # Morning
        (12, 18, 2),  # Afternoon
    )
    for start, stop, code in buckets:
        if start <= hour < stop:
            return code
    return 3  # Night

def tw_time(utc_time):
    """Parse a raw post-time string into a timezone-naive timestamp.

    Two layouts are handled, tried in this order:

    1. Chinese layout, e.g. ``"2025年04月27日 02:25"``. The value is parsed
       and returned exactly as written; no timezone conversion is applied.
       NOTE(review): this treats such strings as already being Asia/Taipei
       wall-clock time -- confirm against the scraper.
    2. Anything ``pd.to_datetime(..., utc=True)`` accepts, e.g. the ISO
       string ``"2025-04-27T09:18:00.000Z"``. The value is converted to
       Asia/Taipei and the timezone information is then dropped.

    Parameters
    ----------
    utc_time : Any
        Raw cell value. Anything that is not a ``str`` (NaN, None, ...)
        yields ``pd.NaT``.

    Returns
    -------
    pd.Timestamp or pd.NaT
        Timezone-naive timestamp, or ``pd.NaT`` when the value is not a
        string or cannot be parsed (a diagnostic line is printed in the
        latter case).
    """
    if not isinstance(utc_time, str):
        return pd.NaT

    # Turn ideographic (U+3000) and non-breaking (U+00A0) spaces into plain
    # spaces instead of deleting them: when one of them separates the date
    # from the time, deleting it glues the two together and the layout below
    # (which requires whitespace there) no longer matches.
    cleaned = utc_time.replace('\u3000', ' ').replace('\xa0', ' ').strip()

    try:
        # Chinese layout: "2025年04月27日 02:25"
        local_dt = datetime.strptime(cleaned, "%Y年%m月%d日 %H:%M")
        # Wrap in Timestamp so both branches hand back the same type.
        return pd.Timestamp(local_dt)
    except ValueError:
        # Not the Chinese layout (or not representable) -- try the generic parser.
        pass

    try:
        # ISO layout: "2025-04-27T09:18:00.000Z"
        parsed = pd.to_datetime(cleaned, utc=True)
        return parsed.tz_convert("Asia/Taipei").tz_localize(None)
    except Exception as e:
        # Deliberate best-effort: report the bad value and keep the row as NaT.
        print(f"Failed to parse: {cleaned}, error: {e}")
        return pd.NaT

            
def time_processing(df):
    """Add time-derived feature columns to ``df`` and return it.

    The frame is modified in place (columns are assigned directly on ``df``)
    and the same object is returned.

    Reads
    -----
    ``post_time``            raw post-time strings, parsed with ``tw_time``.
    ``scrape_time_origin``   scrape timestamps, parsed as UTC.

    Writes
    ------
    ``post_time_origin``     parsed post time (timezone-naive).
    ``scrape_time_origin``   overwritten with naive Asia/Taipei timestamps.
    ``time_gap``             whole hours elapsed between post and scrape.
    ``post_weekday``         day of week of the post (Monday = 0).
    ``post_hour``            hour of the post.
    ``time_of_day``          ``time_of_day`` code derived from ``post_hour``.
    """
    # Parse post_time with tw_time; the outer to_datetime guarantees a
    # datetime64 column even when every row failed to parse or the frame is
    # empty, so the .dt accessors below always work.
    df['post_time_origin'] = pd.to_datetime(df['post_time'].apply(tw_time), errors='coerce')
    df['scrape_time_origin'] = (
        pd.to_datetime(df['scrape_time_origin'], utc=True)
        .dt.tz_convert('Asia/Taipei')
        .dt.tz_localize(None)
    )

    # Whole hours between posting and scraping. Floor-dividing the timedelta
    # by one hour counts full days as well; `.dt.seconds` only exposes the
    # sub-day remainder, so a gap of 1 day 3 hours would be reported as 3.
    elapsed = df['scrape_time_origin'] - df['post_time_origin']
    df['time_gap'] = elapsed // pd.Timedelta(hours=1)

    # Day of week (Monday = 0)
    df["post_weekday"] = df["post_time_origin"].dt.dayofweek

    # Hour of the post
    df["post_hour"] = df["post_time_origin"].dt.hour

    # Convert hour into time-of-day category.
    # NOTE(review): rows with an unparsed post time have a NaN hour here and
    # receive whatever time_of_day returns for NaN -- confirm that is intended.
    df['time_of_day'] = df['post_hour'].apply(time_of_day)

    return df

def char_length(content):
    """Return ``len(content)``, or 0 when ``content`` has no length.

    Missing values arrive as objects without a length (a float NaN, ``None``,
    ...); every such value is counted as zero characters so the function
    always returns an int and never falls through to an implicit ``None``.
    """
    try:
        return len(content)
    except TypeError:
        # len() raises TypeError for unsized objects such as float NaN or None.
        return 0


def lang_function(lang):
    """Collapse a language tag to one of ``'Ch'``, ``'en'`` or ``'other'``.

    The comparison is exact and case-sensitive: only the literal tags
    ``'Ch'`` and ``'en'`` are kept; every other value maps to ``'other'``.
    """
    for known_tag in ('Ch', 'en'):
        if lang == known_tag:
            return known_tag
    return 'other'


# Build the feature frame.
# drop_duplicates returns a new frame rather than modifying df, so the result
# must be assigned back for posts with repeated `content` to actually be removed.
df = df.drop_duplicates(subset=['content'])
df = time_processing(df)

# Collapse language tags to 'Ch' / 'en' / 'other'.
df['lang'] = df['lang'].apply(lang_function)
# Post length in characters, not counting plain spaces.
df['post_length'] = df['content'].str.replace(' ', '', regex=False).str.len()
# Drop the raw text/time columns and the intermediates created above.
df = df.drop(columns=['post_time','time_info','emojis','post_hour','content','scrape_time_origin','post_time_origin'])
df

Unnamed: 0,has_photo,has_video,like_count,reply_count,repost_count,share_count,view_count,followers_count,emoji_count,lang,post_weekday,viral,has_question,has_exclaim,sentiment_score,sentiment_label,time_gap,time_of_day,post_length
0,False,False,0,0,0,0,211,85,0,Ch,1,0,False,False,0.699362,neutral,21,3,26
1,True,False,0,0,0,0,0,60,0,Ch,4,0,False,False,0.904839,positive,9,3,388
2,False,False,0,0,0,0,143,1,3,Ch,6,0,False,False,0.648600,neutral,9,3,20
3,False,False,1,0,0,0,3598,24,0,Ch,3,0,False,False,0.745636,neutral,9,1,15
4,False,False,0,0,0,0,410,1,0,Ch,3,0,False,False,0.954807,positive,16,0,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1514,False,False,37,0,0,0,18000,9782,3,Ch,3,1,True,False,0.555900,negative,11,2,64
1515,False,False,714,5,43,8,7067,4762,0,Ch,2,0,False,False,0.647846,neutral,23,0,90
1516,False,False,2,2,0,0,76,12,1,Ch,2,0,False,False,0.685300,neutral,11,2,30
1517,False,False,1,0,0,0,476,5,0,Ch,3,0,True,False,0.792375,positive,8,1,83


In [42]:
# Per-column overview: dtype, number and percentage of missing values,
# number of distinct values and the distinct values themselves.
null_counts = df.isna().sum()
list_item = [
    [
        col,
        df[col].dtype,
        null_counts[col],
        100 * null_counts[col] / len(df[col]),
        df[col].nunique(),
        df[col].unique(),
    ]
    for col in df.columns
]
summary_columns = ['feature', 'data_type', 'null_num', 'null_persen', 'unique_num', 'unique_sample']
desc_df = pd.DataFrame(data=list_item, columns=summary_columns)
print('test_half')
desc_df


test_half


Unnamed: 0,feature,data_type,null_num,null_persen,unique_num,unique_sample
0,has_photo,bool,0,0.0,2,"[False, True]"
1,has_video,bool,0,0.0,2,"[False, True]"
2,like_count,int64,0,0.0,290,"[0, 1, 17000, 3, 9, 5, 2, 13, 113, 33, 90, 10,..."
3,reply_count,int64,0,0.0,81,"[0, 440, 2, 5, 10, 1, 50, 3, 14, 29, 7, 28, 17..."
4,repost_count,int64,0,0.0,103,"[0, 550, 1, 37, 81, 20, 280, 2, 3, 4, 86, 77, ..."
5,share_count,int64,0,0.0,97,"[0, 6655, 1, 13, 3, 145, 5, 9, 47, 2, 42, 23, ..."
6,view_count,int64,0,0.0,1097,"[211, 0, 143, 3598, 410, 450000, 8270, 652, 38..."
7,followers_count,int64,0,0.0,640,"[85, 60, 1, 24, 152, 328, 647, 0, 380, 59, 148..."
8,emoji_count,int64,0,0.0,14,"[0, 3, 2, 4, 6, 1, 5, 10, 7, 12, 13, 23, 14, 11]"
9,lang,object,0,0.0,1,[Ch]


In [46]:
# Replace the working frame with the contents of test_data.csv.
df = pd.read_csv(filepath_or_buffer='test_data.csv')

In [47]:
def segmented(view):
    """Bucket a view count into a three-level tier.

    Returns 0 for fewer than 1000 views, 1 for 1000 through 10000 views
    (both ends inclusive) and 2 for everything else. Values that satisfy
    neither comparison (e.g. NaN) therefore also land in tier 2.
    """
    if view < 1000:
        return 0
    return 1 if 1000 <= view <= 10000 else 2
        
# Label each row with its view-count tier, then show how many rows fall in each tier.
df['viral'] = df['view_count'].map(segmented)
df['viral'].value_counts()

viral
0    745
1    604
2    170
Name: count, dtype: int64

In [48]:
# Write the frame (now carrying the `viral` column) to test_data.csv without the index column.
df.to_csv(path_or_buf='test_data.csv', index=False)