In [8]:
from skimage import io
import os
import re
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import math
import numpy as np
from skimage import exposure, img_as_float

In [9]:
profile_image_path = './pro_img/'


def inspect_dataset(df_data):
    """pytoho
        查看加载的数据基本信息
    """
    print('数据集基本信息：')
    print(df_data.info())
    print('数据集有{}行，{}列'.format(df_data.shape[0], df_data.shape[1]))
    print('数据预览:')
    print(df_data.head())


def check_profile_image(img_link):
    """
        判断头像图片链接是否有效
        如果有效，下载到本地，并且返回保存路径
    """
    save_image_path = ''
    # 有效的图片扩展名
    valid_img_ext_lst = ['.jpeg', '.png', '.jpg']

    try:
        img_data = io.imread(img_link)
        image_name = img_link.rsplit('/')[-1]
        if any(valid_img_ext in image_name.lower() for valid_img_ext in valid_img_ext_lst):
            # 确保图片文件包含有效的扩展名
            save_image_path = os.path.join(profile_image_path, image_name)
            io.imsave(save_image_path, img_data)
    except:
        print('头像链接 {} 无效'.format(img_link))

    return save_image_path

def clean_text(text):
    """
        清洗文本数据
    """
    # just in case
    text = text.lower()

    # 去除特殊字符
    text = re.sub('\s\W', ' ', text)
    text = re.sub('\W\s', ' ', text)
    text = re.sub('\s+', ' ', text)

    return text


def split_train_test(df_data, size=0.8):
    """
        分割训练集和测试集
    """
    # 为保证每个类中的数据能在训练集中和测试集中的比例相同，所以需要依次对每个类进行处理
    df_train = pd.DataFrame()
    df_test = pd.DataFrame()

    labels = [0, 1]
    for label in labels:
        # 找出gender的记录
        text_df_w_label = df_data[df_data['label'] == label]
        # 重新设置索引，保证每个类的记录是从0开始索引，方便之后的拆分
        text_df_w_label = text_df_w_label.reset_index()

        # 默认按80%训练集，20%测试集分割
        # 这里为了简化操作，取前80%放到训练集中，后20%放到测试集中
        # 当然也可以随机拆分80%，20%（尝试实现下DataFrame中的随机拆分）

        # 该类数据的行数
        n_lines = text_df_w_label.shape[0]
        split_line_no = math.floor(n_lines * size)
        text_df_w_label_train = text_df_w_label.iloc[:split_line_no, :]
        text_df_w_label_test = text_df_w_label.iloc[split_line_no:, :]

        # 放入整体训练集，测试集中
        df_train = df_train.append(text_df_w_label_train)
        df_test = df_test.append(text_df_w_label_test)

    df_train = df_train.reset_index()
    df_test = df_test.reset_index()
    return df_train, df_test


def get_word_list_from_data(text_s):
    """
        将数据集中的单词放入到一个列表中
    """
    word_list = []
    for _, text in text_s.iteritems():
        word_list += text.split(' ')
    return word_list


def proc_text(text):
    """
        分词+去除停用词
    """
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    filtered_words = [word for word in words if word not in stopwords.words('english')]
    return " ".join(filtered_words)



def extract_tf_idf(text_s, text_collection, common_words_freqs):
    """
        提取tf-idf特征
    """
    # 这里只选择TF-IDF特征作为例子
    # 可考虑使用词频或其他文本特征作为额外的特征

    n_sample = text_s.shape[0]
    n_feat = len(common_words_freqs)

    common_words = [word for word, _ in common_words_freqs]

    # 初始化
    X = np.zeros([n_sample, n_feat])

    print('提取tf-idf特征...')
    for i, text in text_s.iteritems():
        feat_vec = []
        for word in common_words:
            if word in text:
                # 如果在高频词中，计算TF-IDF值
                tf_idf_val = text_collection.tf_idf(word, text)
            else:
                tf_idf_val = 0

            feat_vec.append(tf_idf_val)

        # 赋值
        X[i, :] = np.array(feat_vec)

    return X


def hex_to_rgb(value):
    """
        十六进制颜色码转换为RGB值
    """
    rgb_list = list(int(value[i:i + 2], 16) for i in range(0, 6, 2))
    return rgb_list


def extract_rgb_feat(hex_color_s):
    """
         从十六进制颜色码中提取RGB值作为特征
    """
    n_sample = hex_color_s.shape[0]
    n_feat = 3

    # 初始化
    X = np.zeros([n_sample, n_feat])

    print('提取RGB特征...')
    for i, hex_val in hex_color_s.iteritems():
        feat_vec = hex_to_rgb(hex_val)

        # 赋值
        X[i, :] = np.array(feat_vec)

    return X


def extract_rgb_hist_feat(img_path_s):
    """
        从图像中提取RGB直方图特征
    """
    n_sample = img_path_s.shape[0]
    n_bins = 100    # 每个通道bin的个数
    n_feat = n_bins * 3

    # 初始化
    X = np.zeros([n_sample, n_feat])

    print('提取RGB直方图特征...')
    for i, img_path in img_path_s.iteritems():
        # 加载图像
        img_data = io.imread(img_path)
        img_data = img_as_float(img_data)

        if img_data.ndim == 3:
            # 3个通道
            hist_r, _ = exposure.histogram(img_data[:, :, 0], nbins=n_bins)
            hist_g, _ = exposure.histogram(img_data[:, :, 1], nbins=n_bins)
            hist_b, _ = exposure.histogram(img_data[:, :, 2], nbins=n_bins)
        else:
            # 2个通道
            hist, _ = exposure.histogram(img_data, nbins=n_bins)
            hist_r = hist.copy()
            hist_g = hist.copy()
            hist_b = hist.copy()

        feat_vec = np.concatenate((hist_r, hist_b, hist_g))

        # 赋值
        X[i, :] = np.array(feat_vec)

    return X

In [10]:
import zipfile
def unzip(zip_filepath, dest_path):
    """
        解压zip文件
    """
    with zipfile.ZipFile(zip_filepath) as zf:
        zf.extractall(path=dest_path)


def get_dataset_filename(zip_filepath):
    """
        获取数据库文件名
    """
    with zipfile.ZipFile(zip_filepath) as zf:
        return zf.namelist()[0]


def cal_acc(true_labels, pred_labels):
    """
        计算准确率
    """
    n_total = len(true_labels)
    correct_list = [true_labels[i] == pred_labels[i] for i in range(n_total)]

    acc = sum(correct_list) / n_total
    return acc



In [11]:
import os
import pandas as pd
import nltk
from nltk.text import TextCollection
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

In [12]:
dataset_path = './dataset'  # 数据集路径
zip_filename = 'twitter-user-gender-classification.zip'  # zip文件名
zip_filepath = os.path.join(dataset_path, zip_filename)  # zip文件路径
cln_datapath = './cln_data.csv'     # 清洗好的数据路径
# 是否第一次运行
is_first_run = False

In [20]:
"""                                                                                               zx
        主函数
    """
dataset_filename = get_dataset_filename(zip_filepath)  # 数据集文件名（在zip中）
dataset_filepath = os.path.join(dataset_path, dataset_filename)  # 数据集文件路径
if is_first_run:
    print('解压zip...')
    unzip(zip_filepath, dataset_path)
    print('完成.')

In [21]:
data = pd.read_csv(dataset_filepath, encoding='latin1',
                           usecols=['gender', 'description', 'link_color',
                                    'profileimage', 'sidebar_color', 'text'])
inspect_dataset(data)

数据集基本信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20050 entries, 0 to 20049
Data columns (total 6 columns):
gender           19953 non-null object
description      16306 non-null object
link_color       20050 non-null object
profileimage     20050 non-null object
sidebar_color    20050 non-null object
text             20050 non-null object
dtypes: object(6)
memory usage: 939.9+ KB
None
数据集有20050行，6列
数据预览:
   gender                                        description link_color  \
0    male                              i sing my own rhythm.     08C2C2   
1    male  I'm the author of novels filled with family dr...     0084B4   
2    male                louis whining and squealing and all     ABB8C2   
3    male  Mobile guy.  49ers, Shazam, Google, Kleiner Pe...     0084B4   
4  female  Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...     3B94D9   

                                        profileimage sidebar_color  \
0  https://pbs.twimg.com/profile_images/414342229...        F

In [22]:
filtered_data = data[(data['gender'] == 'male') | (data['gender'] == 'female')]

        # 2.2 过滤掉 'description' 列为空的数据
filtered_data = filtered_data.dropna(subset=['description'])

        # 2.3 过滤掉 'link_color' 列和 'sidebar_color' 列非法的16进制数据
filtered_data = filtered_data[filtered_data['link_color'].str.len() == 6]
filtered_data = filtered_data[filtered_data['sidebar_color'].str.len() == 6]

        # 2.4 清洗文本数据
print('清洗文本数据...')
cln_desc = filtered_data['description'].apply(clean_text)
cln_text = filtered_data['text'].apply(clean_text)
filtered_data['cln_desc'] = cln_desc
filtered_data['cln_text'] = cln_text

清洗文本数据...


In [23]:
  # 保存处理好的数据
filtered_data.to_csv(cln_datapath, index=False, encoding='latin1')
   # 读取处理好的数据
clean_data = pd.read_csv(cln_datapath, encoding='latin1',
                             usecols=['gender', 'cln_desc', 'cln_text',
                                      'link_color', 'sidebar_color'])

    # 查看label的分布
print(clean_data.groupby('gender').size())

gender
female    3961
male      3908
dtype: int64


In [24]:
clean_data.loc[clean_data['gender'] == 'male', 'label'] = 0
clean_data.loc[clean_data['gender'] == 'female', 'label'] = 1

    # 3. 分割数据集
    # 分词 去除停用词
proc_desc_s = clean_data['cln_desc'].apply(proc_text)
clean_data['desc_words'] = proc_desc_s
proc_text_s = clean_data['cln_text'].apply(proc_text)
clean_data['text_words'] = proc_text_s
df_train, df_test = split_train_test(clean_data)
    # 查看训练集测试集基本信息
print('训练集中各类的数据个数：', df_train.groupby('label').size())
print('测试集中各类的数据个数：', df_test.groupby('label').size())

训练集中各类的数据个数： label
0.0    3126
1.0    3168
dtype: int64
测试集中各类的数据个数： label
0.0    782
1.0    793
dtype: int64


In [29]:
    # 4. 特征工程
    # 4.1 训练数据特征提取
print('训练样本特征提取：')
    # 4.1.1 文本数据
    # description数据
print('统计description词频...')
n = 50
from nltk import FreqDist
freq_dist = FreqDist(proc_desc_s)
desc_most_common_words = freq_dist.most_common(n)
print(desc_most_common_words)

训练样本特征提取：
统计description词频...
[('', 34), ('cos even old girl best friend still mum', 18), ('baby could enough', 10), ('ï _ _ùi really love one direction _ù _ ï', 7), ('love', 7), ('twitter bot tweeting english posts francoisgoube ceo cogniteev oncrawl docidohq', 6), ('_ ü sometime last night _ ü r5 world _ ü four 1d made _ ü nialler _ ü', 6), ('like secret little rendezvous perfect j aime dylan et zayn', 6), ('czech adult model pornographic actress 42 inch natural breasts currently livin london follow retweet every day luv u', 5), ('mapping geek ecologist data lover use twitterfeed autotweet uk wildlife flickr pics use twitter widgets html expanded pics practice id', 5), ('run', 5), ('datpiff promo hotnewhiphop promo reverbnation promo soundcloud best net', 5), ('happy', 4), ('best bio ever fuck', 4), ('must 18 view website follow mention retweets check hottest girls _á http co x1mesztesn âé', 4), ('ashley madison open looking turns profile captions member information', 4), ('greek dire

In [32]:
print('统计description词频...')
n_desc_common_words = 50
desc_words_in_train = get_word_list_from_data(df_train['desc_words'])
fdisk = nltk.FreqDist(desc_words_in_train)
desc_common_words_freqs = fdisk.most_common(n_desc_common_words)
print('descriptino中出现最多的{}个词是：'.format(n_desc_common_words))
for word, count in desc_common_words_freqs:
    print('{}: {}次'.format(word, count))
print()

统计description词频...
descriptino中出现最多的50个词是：
_: 523次
co: 483次
û: 464次
ü: 462次
love: 458次
life: 353次
http: 294次
fan: 216次
music: 193次
writer: 192次
lover: 182次
like: 182次
ï: 173次
one: 153次
world: 150次
follow: 146次
ig: 145次
god: 143次
êû: 137次
live: 136次
time: 134次
ª: 133次
sports: 128次
com: 123次
instagram: 123次
https: 116次
things: 109次
make: 109次
best: 109次
girl: 106次
2: 105次
family: 104次
good: 104次
artist: 101次
snapchat: 99次
author: 98次
people: 97次
man: 96次
ªá: 95次
get: 95次
new: 95次
twitter: 94次
blogger: 93次
18: 92次
student: 91次
im: 90次
never: 89次
business: 87次
4: 87次
always: 87次



In [None]:
from nltk.text import TextCollection
word = 'That'
tc = TextCollection([text1, text2, text3,text4, text5])
tf_idf_val = tc.tf_idf(word, new_text)
print('{}的TF-IDF值为：{}'.format(word, tf_idf_val))
for word, count in desc_common_words_freqs:
    print('{}: {}次'.format(word, count))