In [204]:
from typing import List
import pathlib
import bs4 as bs
import re
import os
import pytest 
import glob
import pandas as pd
import chardet
from chardet.universaldetector import UniversalDetector


dataset_folder = 'data'
detector = UniversalDetector()

In [None]:


authorIDs = [3574878, 2845196, 3444474, 3445677, 828046, 4284264, 3498812, 4137740, 3662461, 3363271]
filenames = []



for id in authorIDs:
 filename = glob.glob('blogs/' + str(id) + '.*.xml')
 filenames.append(filename[0])
    
for filename in filenames:
#for filename in glob.glob('blogs/*.xml'):
    print(filename)
    detector.reset()
    for line in open(filename, 'rb'):
        detector.feed(line)
        if detector.done: break
    detector.close()
    print(detector.result['encoding'])

In [226]:
class Post:
    author_number: int
    gender: str
    age: int
    industry: str
    star_sign: str
    date: str  
    post: str

    def to_dict(self):
        return {
            key: getattr(self, key)
            for key in ['author_number', 'gender', 'age', 'industry', 'star_sign', 'date', 'post']
        }

    @staticmethod
    def load_from_file(filename) -> List["Post"]:
        # The last element is the file extension, which we don't care about
        # example file format is: 5144.male.25.indUnk.Scorpio.xml
        age, author_number, gender, industry, star_sign = Post.extract_attributes_from_filename(filename)

        posts = []
        
        encoding = Post.get_encoding(filename);
        xml_source = open(filename, errors='surrogateescape', encoding=encoding).read()
        #xml_source = open(filename, errors='strict', encoding=encoding).read()
        #xml_source = open(filename, errors='replace').read()
        xml_soup = bs.BeautifulSoup(xml_source, "lxml")
        xml_posts = xml_soup.find_all("post")
        xml_dates = xml_soup.find_all("date")
        #print(posts[0].text)
        
        for i in range(0, len(xml_posts)):
            new_post = Post.create_from_attributes(
                author_number, gender, age, industry, star_sign, xml_dates[i].text.strip(), xml_posts[i].text.strip())
            posts.append(new_post)

        return posts

    @staticmethod
    def extract_attributes_from_filename(filename):
        base_filename = pathlib.Path(filename).name  # Get just the filename component
        author_number, gender, age, industry, star_sign, _ = base_filename.split(".")
        author_number = int(author_number)
        age = int(age)
        return age, author_number, gender, industry, star_sign

    @staticmethod
    def get_encoding(filename):
        detector.reset()
        for line in open(filename, 'rb'):
            detector.feed(line)
            if detector.done: break
        detector.close()
        return detector.result['encoding']
    
    @staticmethod
    def create_from_attributes(author_number, gender, age, industry, star_sign, date, post):
        p = Post()
        p.author_number = author_number
        p.gender = gender
        p.age = age
        p.industry = industry
        p.star_sign = star_sign
        p.date = date  
        p.post = post  
        return p

In [227]:
p = Post.load_from_file('data\\3363271.female.27.Student.Virgo.xml')
e = Post.extract_attributes_from_filename('data\\3363271.female.27.Student.Virgo.xml')
c = Post.get_encoding('data\\4137740.female.47.indUnk.Libra.xml')
p = Post.load_from_file('data\\4137740.female.47.indUnk.Libra.xml')
print(p[0].post)
#print(e)
print(c)

Well... I am unsure how to start this new blogging stuff...guess I will just reflect a little and see how it goes.  Have you ever been stuck in a matrix? I have! To me it was like being in a padded, suspended time warp during the early part of  my adult life. A time that lasted over 12 years. That shouldn’t happen to anyone...and shouldn’t have happen to me. But it did. I was 23 , a new single mom...no friends...no job...and not much family to speak of. I had to make money and take care of my son... I couldn’t afford to live on my own so me and my child stayed with my parents... I went back to school and learned a vocation. Then... tried for years and years to make a difference...be somebody and amount to something for my sons sake...the matrix I was in got thicker and harder to escape from...I couldn’t do enough for anyone. My parents, my employer, so called friends, relatives.... it was like total non-acceptance...I felt laughed at and demeaned.  (Like a RedHeaded Step Child.)  I had

In [228]:
filename_id_pattern = re.compile(r"(\d{3,})\..*\..*\..*\..*\.xml")

def get_filename_id(filename):
    # We use search, not match, as we don't care if it's not the whole string
    match = filename_id_pattern.search(filename)
    if match:
        return match.group(1)
    else:
        raise ValueError(f"Could not find an ID in filename {filename}")
        
def get_all_xml_files_in_folder(dataset_folder):
    return glob.glob(os.path.join(dataset_folder, "*.xml"))

def load_dataset_from_raw(dataset_folder):
    all_posts = []
    #print(get_all_xml_files_in_folder(dataset_folder))
    for filename in get_all_xml_files_in_folder(dataset_folder):
        current_posts = Post.load_from_file(filename)
        all_posts.extend(current_posts)
    return all_posts



In [229]:
def save_dataset(all_posts, output_file):
    dataset = pd.DataFrame([post.to_dict() for post in all_posts])
    dataset.to_parquet(output_file, compression='gzip')
    return dataset


def load_dataset(input_file):
    return pd.read_parquet(input_file)

In [None]:
raw_dataset = load_dataset_from_raw(dataset_folder)
save_dataset(raw_dataset, 'posts1.data')
#processed_dataset = load_dataset(filename)


In [118]:
dataset_filename = "posts1.data"
all_posts = load_dataset(dataset_filename)

all_posts.head() 

Unnamed: 0,author_number,gender,age,industry,star_sign,date,post
0,1000331,female,37,indUnk,Leo,"31,May,2004","Well, everyone got up and going this morning. ..."
1,1000331,female,37,indUnk,Leo,"29,May,2004",My four-year old never stops talking. She'll ...
2,1000331,female,37,indUnk,Leo,"28,May,2004","Actually it's not raining yet, but I bought 15..."
3,1000331,female,37,indUnk,Leo,"28,May,2004",Ha! Just set up my RSS feed - that is so easy!...
4,1000331,female,37,indUnk,Leo,"28,May,2004","Oh, which just reminded me, we were talking ab..."


In [198]:
#example_authors = [3574878, 2845196, 3444474, 3445677, 828046, 4284264, 3498812, 4137740, 3662461, 3363271]
example_authors = [828046, 4137740, 3662461 ]

def get_sampled_authors(dataset, sample_authors):
    mask = dataset['author_number'].isin(sample_authors)
    return dataset[mask]


sample = get_sampled_authors(all_posts, example_authors)
    

documents = sample['post'].values
authors = sample['author_number'].values
print(authors) #19 authors

[3662461 3662461 3662461 3662461 3662461 4137740 4137740  828046  828046
  828046  828046  828046  828046  828046  828046  828046  828046  828046
  828046]


In [199]:
from sklearn.model_selection import train_test_split
documents_train, documents_test, authors_train, authors_test = train_test_split(documents, authors)

len(documents_train), len(documents_test), len(authors_train), len(authors_test)



(14, 5, 14, 5)

In [200]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

#preprocessor = TfidfVectorizer(analyzer='char', ngram_range=(2,3))
preprocessor = TfidfVectorizer(analyzer='word')
X_train = preprocessor.fit_transform(documents_train)
print(X_train.shape)
X_test = preprocessor.transform(documents_test)
print(X_test.shape)

model = SGDClassifier()
model.fit(X_train, authors_train)

authors_predicted = model.predict(X_test)

print(authors_test)
print(authors_predicted)

from sklearn.metrics import classification_report
print(classification_report(authors_test, authors_predicted))


(14, 374)
(5, 374)
[3662461 3662461 4137740  828046  828046]
[3662461  828046  828046  828046  828046]
              precision    recall  f1-score   support

      828046       0.50      1.00      0.67         2
     3662461       1.00      0.50      0.67         2
     4137740       0.00      0.00      0.00         1

    accuracy                           0.60         5
   macro avg       0.50      0.50      0.44         5
weighted avg       0.60      0.60      0.53         5



  _warn_prf(average, modifier, msg_start, len(result))


In [201]:
feature_names = preprocessor.get_feature_names()
print(len(feature_names))
dense = X_train.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)


374


In [202]:
print(df[0:100])

          20      2074        60       70s  ability  accessories   achieve  \
0   0.000000  0.000000  0.081919  0.000000  0.00000     0.000000  0.000000   
1   0.000000  0.000000  0.000000  0.000000  0.00000     0.000000  0.000000   
2   0.000000  0.000000  0.000000  0.194517  0.00000     0.000000  0.000000   
3   0.000000  0.000000  0.000000  0.000000  0.00000     0.000000  0.000000   
4   0.000000  0.000000  0.000000  0.000000  0.00000     0.000000  0.000000   
5   0.239248  0.239248  0.000000  0.000000  0.00000     0.000000  0.000000   
6   0.000000  0.000000  0.000000  0.000000  0.00000     0.153067  0.153067   
7   0.000000  0.000000  0.000000  0.000000  0.00000     0.000000  0.000000   
8   0.000000  0.000000  0.000000  0.000000  0.00000     0.000000  0.000000   
9   0.000000  0.000000  0.000000  0.000000  0.00000     0.000000  0.000000   
10  0.000000  0.000000  0.000000  0.000000  0.00000     0.000000  0.000000   
11  0.000000  0.000000  0.000000  0.000000  0.06367     0.000000